kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
//! Cache management MCP tools.
|
|
2
|
+
|
|
3
|
+
use crate::{cache, mcp::errors::map_kreuzberg_error_to_mcp};
|
|
4
|
+
use rmcp::{
|
|
5
|
+
ErrorData as McpError,
|
|
6
|
+
handler::server::wrapper::Parameters,
|
|
7
|
+
model::{CallToolResult, Content, RawContent},
|
|
8
|
+
tool,
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
/// MCP tool methods for cache management.
|
|
12
|
+
pub(in crate::mcp) trait CacheTool {
|
|
13
|
+
/// Get cache statistics.
|
|
14
|
+
///
|
|
15
|
+
/// This tool returns statistics about the cache including total files, size, and disk space.
|
|
16
|
+
#[tool(
|
|
17
|
+
description = "Get cache statistics including total files, size, and available disk space.",
|
|
18
|
+
annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
|
|
19
|
+
)]
|
|
20
|
+
fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
|
|
21
|
+
let cache_dir = std::env::current_dir()
|
|
22
|
+
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
|
23
|
+
.join(".kreuzberg");
|
|
24
|
+
|
|
25
|
+
let stats = cache::get_cache_metadata(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
|
|
26
|
+
|
|
27
|
+
let response = format!(
|
|
28
|
+
"Cache Statistics\n\
|
|
29
|
+
================\n\
|
|
30
|
+
Directory: {}\n\
|
|
31
|
+
Total files: {}\n\
|
|
32
|
+
Total size: {:.2} MB\n\
|
|
33
|
+
Available space: {:.2} MB\n\
|
|
34
|
+
Oldest file age: {:.2} days\n\
|
|
35
|
+
Newest file age: {:.2} days",
|
|
36
|
+
cache_dir.to_string_lossy(),
|
|
37
|
+
stats.total_files,
|
|
38
|
+
stats.total_size_mb,
|
|
39
|
+
stats.available_space_mb,
|
|
40
|
+
stats.oldest_file_age_days,
|
|
41
|
+
stats.newest_file_age_days
|
|
42
|
+
);
|
|
43
|
+
|
|
44
|
+
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Clear the cache.
|
|
48
|
+
///
|
|
49
|
+
/// This tool removes all cached files and returns the number of files removed and space freed.
|
|
50
|
+
#[tool(
|
|
51
|
+
description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
|
|
52
|
+
annotations(title = "Clear Cache", destructive_hint = true)
|
|
53
|
+
)]
|
|
54
|
+
fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
|
|
55
|
+
let cache_dir = std::env::current_dir()
|
|
56
|
+
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
|
57
|
+
.join(".kreuzberg");
|
|
58
|
+
|
|
59
|
+
let (removed_files, freed_mb) =
|
|
60
|
+
cache::clear_cache_directory(cache_dir.to_str().unwrap_or(".")).map_err(map_kreuzberg_error_to_mcp)?;
|
|
61
|
+
|
|
62
|
+
let response = format!(
|
|
63
|
+
"Cache cleared successfully\n\
|
|
64
|
+
Directory: {}\n\
|
|
65
|
+
Removed files: {}\n\
|
|
66
|
+
Freed space: {:.2} MB",
|
|
67
|
+
cache_dir.to_string_lossy(),
|
|
68
|
+
removed_files,
|
|
69
|
+
freed_mb
|
|
70
|
+
);
|
|
71
|
+
|
|
72
|
+
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): the tools resolve their directory from `std::env::current_dir()`, so
    // these tests read and clear `.kreuzberg` under whatever directory the test runs in.

    /// Minimal implementor used to exercise the trait's default tool methods.
    struct TestMcpServer;

    impl CacheTool for TestMcpServer {}

    /// Unwraps a tool call and returns the text of its first content item.
    ///
    /// Panics when the call failed, returned no content, or returned non-text content, so
    /// that no test can pass vacuously on a missing or unexpected payload.
    fn expect_text(result: Result<CallToolResult, McpError>) -> String {
        let call_result = result.expect("cache tool call should succeed");
        let Some(content) = call_result.content.first() else {
            panic!("Expected content in result");
        };
        match &content.raw {
            RawContent::Text(text) => text.text.clone(),
            _ => panic!("Expected text content"),
        }
    }

    /// Asserts that every label in `labels` occurs somewhere in `text`.
    fn assert_contains_all(text: &str, labels: &[&str]) {
        for label in labels {
            assert!(text.contains(label), "expected {label:?} in response: {text:?}");
        }
    }

    #[tokio::test]
    async fn test_cache_stats_returns_statistics() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_stats(Parameters(())));

        assert_contains_all(
            &text,
            &[
                "Cache Statistics",
                "Directory:",
                "Total files:",
                "Total size:",
                "Available space:",
            ],
        );
    }

    #[tokio::test]
    async fn test_cache_clear_returns_result() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_clear(Parameters(())));

        assert_contains_all(
            &text,
            &["Cache cleared", "Directory:", "Removed files:", "Freed space:"],
        );
    }

    #[tokio::test]
    async fn test_cache_clear_is_idempotent() {
        let server = TestMcpServer;

        // Clearing an already-cleared cache must still succeed.
        let result1 = server.cache_clear(Parameters(()));
        assert!(result1.is_ok());

        let result2 = server.cache_clear(Parameters(()));
        assert!(result2.is_ok());
    }

    #[tokio::test]
    async fn test_cache_clear_returns_metrics() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_clear(Parameters(())));

        assert_contains_all(&text, &["Removed files:", "Freed space:"]);
    }

    #[tokio::test]
    async fn test_cache_stats_returns_valid_data() {
        let server = TestMcpServer;

        let text = expect_text(server.cache_stats(Parameters(())));

        assert_contains_all(
            &text,
            &[
                "Cache Statistics",
                "Directory:",
                "Total files:",
                "Total size:",
                "Available space:",
                "Oldest file age:",
                "Newest file age:",
            ],
        );
    }
}
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
//! Document extraction MCP tools.
|
|
2
|
+
|
|
3
|
+
use base64::prelude::*;
|
|
4
|
+
use crate::{
|
|
5
|
+
ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
|
|
6
|
+
extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
|
|
7
|
+
mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
|
|
8
|
+
};
|
|
9
|
+
use rmcp::{
|
|
10
|
+
ErrorData as McpError,
|
|
11
|
+
handler::server::wrapper::Parameters,
|
|
12
|
+
model::{CallToolResult, Content, RawContent},
|
|
13
|
+
tool,
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
/// MCP tool methods for document extraction.
|
|
17
|
+
pub(in crate::mcp) trait ExtractionTool {
|
|
18
|
+
/// Get reference to default config
|
|
19
|
+
fn default_config(&self) -> &std::sync::Arc<ExtractionConfig>;
|
|
20
|
+
|
|
21
|
+
/// Extract content from a file.
|
|
22
|
+
///
|
|
23
|
+
/// This tool extracts text, metadata, and tables from documents in various formats
|
|
24
|
+
/// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
|
|
25
|
+
#[tool(
|
|
26
|
+
description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
|
|
27
|
+
annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
|
|
28
|
+
)]
|
|
29
|
+
async fn extract_file(
|
|
30
|
+
&self,
|
|
31
|
+
Parameters(params): Parameters<ExtractFileParams>,
|
|
32
|
+
) -> Result<CallToolResult, McpError> {
|
|
33
|
+
let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
|
|
34
|
+
|
|
35
|
+
let result = if params.r#async {
|
|
36
|
+
extract_file(¶ms.path, params.mime_type.as_deref(), &config)
|
|
37
|
+
.await
|
|
38
|
+
.map_err(map_kreuzberg_error_to_mcp)?
|
|
39
|
+
} else {
|
|
40
|
+
extract_file_sync(¶ms.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
let response = format_extraction_result(&result);
|
|
44
|
+
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Extract content from base64-encoded bytes.
|
|
48
|
+
///
|
|
49
|
+
/// This tool extracts text, metadata, and tables from base64-encoded document data.
|
|
50
|
+
#[tool(
|
|
51
|
+
description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
|
|
52
|
+
annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
|
|
53
|
+
)]
|
|
54
|
+
async fn extract_bytes(
|
|
55
|
+
&self,
|
|
56
|
+
Parameters(params): Parameters<ExtractBytesParams>,
|
|
57
|
+
) -> Result<CallToolResult, McpError> {
|
|
58
|
+
let bytes = BASE64_STANDARD
|
|
59
|
+
.decode(¶ms.data)
|
|
60
|
+
.map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
|
|
61
|
+
|
|
62
|
+
let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
|
|
63
|
+
|
|
64
|
+
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
65
|
+
|
|
66
|
+
let result = if params.r#async {
|
|
67
|
+
extract_bytes(&bytes, mime_type, &config)
|
|
68
|
+
.await
|
|
69
|
+
.map_err(map_kreuzberg_error_to_mcp)?
|
|
70
|
+
} else {
|
|
71
|
+
extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
let response = format_extraction_result(&result);
|
|
75
|
+
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/// Extract content from multiple files in parallel.
|
|
79
|
+
///
|
|
80
|
+
/// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
|
|
81
|
+
#[tool(
|
|
82
|
+
description = "Extract content from multiple files in parallel. Returns results for all files.",
|
|
83
|
+
annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
|
|
84
|
+
)]
|
|
85
|
+
async fn batch_extract_files(
|
|
86
|
+
&self,
|
|
87
|
+
Parameters(params): Parameters<BatchExtractFilesParams>,
|
|
88
|
+
) -> Result<CallToolResult, McpError> {
|
|
89
|
+
let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
|
|
90
|
+
|
|
91
|
+
let results = if params.r#async {
|
|
92
|
+
batch_extract_file(params.paths.clone(), &config)
|
|
93
|
+
.await
|
|
94
|
+
.map_err(map_kreuzberg_error_to_mcp)?
|
|
95
|
+
} else {
|
|
96
|
+
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
let mut response = String::new();
|
|
100
|
+
for (i, result) in results.iter().enumerate() {
|
|
101
|
+
response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
|
|
102
|
+
response.push_str(&format_extraction_result(result));
|
|
103
|
+
response.push_str("\n\n");
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
#[cfg(test)]
|
|
111
|
+
mod tests {
|
|
112
|
+
use super::*;
|
|
113
|
+
use std::path::PathBuf;
|
|
114
|
+
|
|
115
|
+
/// Get the path to a test document relative to workspace root.
|
|
116
|
+
fn get_test_path(relative_path: &str) -> String {
|
|
117
|
+
let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
|
118
|
+
.parent()
|
|
119
|
+
.unwrap()
|
|
120
|
+
.parent()
|
|
121
|
+
.unwrap()
|
|
122
|
+
.to_path_buf();
|
|
123
|
+
|
|
124
|
+
workspace_root
|
|
125
|
+
.join("test_documents")
|
|
126
|
+
.join(relative_path)
|
|
127
|
+
.to_string_lossy()
|
|
128
|
+
.to_string()
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// Simple test struct for trait implementation
|
|
132
|
+
struct TestMcpServer {
|
|
133
|
+
config: std::sync::Arc<ExtractionConfig>,
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
impl TestMcpServer {
|
|
137
|
+
fn new() -> Self {
|
|
138
|
+
Self {
|
|
139
|
+
config: std::sync::Arc::new(ExtractionConfig::default()),
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
impl ExtractionTool for TestMcpServer {
|
|
145
|
+
fn default_config(&self) -> &std::sync::Arc<ExtractionConfig> {
|
|
146
|
+
&self.config
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
#[tokio::test]
|
|
151
|
+
async fn test_extract_file_sync_with_valid_pdf() {
|
|
152
|
+
let server = TestMcpServer::new();
|
|
153
|
+
let params = ExtractFileParams {
|
|
154
|
+
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
155
|
+
mime_type: None,
|
|
156
|
+
enable_ocr: false,
|
|
157
|
+
force_ocr: false,
|
|
158
|
+
r#async: true,
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
let result = server.extract_file(Parameters(params)).await;
|
|
162
|
+
|
|
163
|
+
assert!(result.is_ok());
|
|
164
|
+
let call_result = result.unwrap();
|
|
165
|
+
if let Some(content) = call_result.content.first() {
|
|
166
|
+
match &content.raw {
|
|
167
|
+
RawContent::Text(text) => {
|
|
168
|
+
assert!(!text.text.is_empty());
|
|
169
|
+
assert!(text.text.contains("Content"));
|
|
170
|
+
}
|
|
171
|
+
_ => panic!("Expected text content"),
|
|
172
|
+
}
|
|
173
|
+
} else {
|
|
174
|
+
panic!("Expected content in result");
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
#[tokio::test]
|
|
179
|
+
async fn test_extract_file_async_with_valid_pdf() {
|
|
180
|
+
let server = TestMcpServer::new();
|
|
181
|
+
let params = ExtractFileParams {
|
|
182
|
+
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
183
|
+
mime_type: None,
|
|
184
|
+
enable_ocr: false,
|
|
185
|
+
force_ocr: false,
|
|
186
|
+
r#async: true,
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
let result = server.extract_file(Parameters(params)).await;
|
|
190
|
+
|
|
191
|
+
assert!(result.is_ok());
|
|
192
|
+
let call_result = result.unwrap();
|
|
193
|
+
if let Some(content) = call_result.content.first() {
|
|
194
|
+
match &content.raw {
|
|
195
|
+
RawContent::Text(text) => {
|
|
196
|
+
assert!(!text.text.is_empty());
|
|
197
|
+
}
|
|
198
|
+
_ => panic!("Expected text content"),
|
|
199
|
+
}
|
|
200
|
+
} else {
|
|
201
|
+
panic!("Expected content in result");
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
#[tokio::test]
|
|
206
|
+
async fn test_extract_file_with_invalid_path() {
|
|
207
|
+
let server = TestMcpServer::new();
|
|
208
|
+
let params = ExtractFileParams {
|
|
209
|
+
path: "/nonexistent/file.pdf".to_string(),
|
|
210
|
+
mime_type: None,
|
|
211
|
+
enable_ocr: false,
|
|
212
|
+
force_ocr: false,
|
|
213
|
+
r#async: true,
|
|
214
|
+
};
|
|
215
|
+
|
|
216
|
+
let result = server.extract_file(Parameters(params)).await;
|
|
217
|
+
|
|
218
|
+
assert!(result.is_err());
|
|
219
|
+
let error = result.unwrap_err();
|
|
220
|
+
assert!(error.code.0 == -32602 || error.code.0 == -32603);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
#[tokio::test]
|
|
224
|
+
async fn test_extract_file_with_mime_type_hint() {
|
|
225
|
+
let server = TestMcpServer::new();
|
|
226
|
+
let params = ExtractFileParams {
|
|
227
|
+
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
228
|
+
mime_type: Some("application/pdf".to_string()),
|
|
229
|
+
enable_ocr: false,
|
|
230
|
+
force_ocr: false,
|
|
231
|
+
r#async: true,
|
|
232
|
+
};
|
|
233
|
+
|
|
234
|
+
let result = server.extract_file(Parameters(params)).await;
|
|
235
|
+
|
|
236
|
+
assert!(result.is_ok());
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
#[tokio::test]
|
|
240
|
+
async fn test_extract_bytes_sync_with_valid_data() {
|
|
241
|
+
let server = TestMcpServer::new();
|
|
242
|
+
|
|
243
|
+
let text_content = b"Hello, world!";
|
|
244
|
+
let encoded = BASE64_STANDARD.encode(text_content);
|
|
245
|
+
|
|
246
|
+
let params = ExtractBytesParams {
|
|
247
|
+
data: encoded,
|
|
248
|
+
mime_type: Some("text/plain".to_string()),
|
|
249
|
+
enable_ocr: false,
|
|
250
|
+
force_ocr: false,
|
|
251
|
+
r#async: true,
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
let result = server.extract_bytes(Parameters(params)).await;
|
|
255
|
+
|
|
256
|
+
assert!(result.is_ok());
|
|
257
|
+
let call_result = result.unwrap();
|
|
258
|
+
if let Some(content) = call_result.content.first() {
|
|
259
|
+
match &content.raw {
|
|
260
|
+
RawContent::Text(text) => {
|
|
261
|
+
assert!(text.text.contains("Hello, world!"));
|
|
262
|
+
}
|
|
263
|
+
_ => panic!("Expected text content"),
|
|
264
|
+
}
|
|
265
|
+
} else {
|
|
266
|
+
panic!("Expected content in result");
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
#[tokio::test]
|
|
271
|
+
async fn test_extract_bytes_with_invalid_base64() {
|
|
272
|
+
let server = TestMcpServer::new();
|
|
273
|
+
|
|
274
|
+
let params = ExtractBytesParams {
|
|
275
|
+
data: "not-valid-base64!!!".to_string(),
|
|
276
|
+
mime_type: None,
|
|
277
|
+
enable_ocr: false,
|
|
278
|
+
force_ocr: false,
|
|
279
|
+
r#async: true,
|
|
280
|
+
};
|
|
281
|
+
|
|
282
|
+
let result = server.extract_bytes(Parameters(params)).await;
|
|
283
|
+
|
|
284
|
+
assert!(result.is_err());
|
|
285
|
+
let error = result.unwrap_err();
|
|
286
|
+
assert_eq!(error.code.0, -32602);
|
|
287
|
+
assert!(error.message.contains("Invalid base64"));
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
#[tokio::test]
|
|
291
|
+
async fn test_batch_extract_files_sync_with_valid_files() {
|
|
292
|
+
let server = TestMcpServer::new();
|
|
293
|
+
let params = BatchExtractFilesParams {
|
|
294
|
+
paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
|
|
295
|
+
enable_ocr: false,
|
|
296
|
+
force_ocr: false,
|
|
297
|
+
r#async: true,
|
|
298
|
+
};
|
|
299
|
+
|
|
300
|
+
let result = server.batch_extract_files(Parameters(params)).await;
|
|
301
|
+
|
|
302
|
+
assert!(result.is_ok());
|
|
303
|
+
let call_result = result.unwrap();
|
|
304
|
+
if let Some(content) = call_result.content.first() {
|
|
305
|
+
match &content.raw {
|
|
306
|
+
RawContent::Text(text) => {
|
|
307
|
+
assert!(text.text.contains("Document 1"));
|
|
308
|
+
assert!(text.text.contains("tiny.pdf"));
|
|
309
|
+
}
|
|
310
|
+
_ => panic!("Expected text content"),
|
|
311
|
+
}
|
|
312
|
+
} else {
|
|
313
|
+
panic!("Expected content in result");
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
#[tokio::test]
|
|
318
|
+
async fn test_batch_extract_files_with_empty_list() {
|
|
319
|
+
let server = TestMcpServer::new();
|
|
320
|
+
let params = BatchExtractFilesParams {
|
|
321
|
+
paths: vec![],
|
|
322
|
+
enable_ocr: false,
|
|
323
|
+
force_ocr: false,
|
|
324
|
+
r#async: true,
|
|
325
|
+
};
|
|
326
|
+
|
|
327
|
+
let result = server.batch_extract_files(Parameters(params)).await;
|
|
328
|
+
|
|
329
|
+
assert!(result.is_ok());
|
|
330
|
+
let call_result = result.unwrap();
|
|
331
|
+
if let Some(content) = call_result.content.first() {
|
|
332
|
+
match &content.raw {
|
|
333
|
+
RawContent::Text(text) => {
|
|
334
|
+
assert!(text.text.is_empty() || text.text.trim().is_empty());
|
|
335
|
+
}
|
|
336
|
+
_ => panic!("Expected text content"),
|
|
337
|
+
}
|
|
338
|
+
} else {
|
|
339
|
+
panic!("Expected content in result");
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
#[tokio::test]
|
|
344
|
+
async fn test_response_includes_metadata() {
|
|
345
|
+
let server = TestMcpServer::new();
|
|
346
|
+
|
|
347
|
+
let test_file = get_test_path("pdfs_with_tables/tiny.pdf");
|
|
348
|
+
|
|
349
|
+
if std::path::Path::new(&test_file).exists() {
|
|
350
|
+
let params = ExtractFileParams {
|
|
351
|
+
path: test_file.to_string(),
|
|
352
|
+
mime_type: None,
|
|
353
|
+
enable_ocr: false,
|
|
354
|
+
force_ocr: false,
|
|
355
|
+
r#async: true,
|
|
356
|
+
};
|
|
357
|
+
|
|
358
|
+
let result = server.extract_file(Parameters(params)).await;
|
|
359
|
+
|
|
360
|
+
assert!(result.is_ok());
|
|
361
|
+
let call_result = result.unwrap();
|
|
362
|
+
|
|
363
|
+
if let Some(content) = call_result.content.first()
|
|
364
|
+
&& let RawContent::Text(text) = &content.raw
|
|
365
|
+
{
|
|
366
|
+
assert!(text.text.contains("Metadata:"));
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
#[tokio::test]
|
|
372
|
+
async fn test_batch_extract_preserves_file_order() {
|
|
373
|
+
let server = TestMcpServer::new();
|
|
374
|
+
|
|
375
|
+
let file1 = get_test_path("pdfs_with_tables/tiny.pdf");
|
|
376
|
+
let file2 = get_test_path("pdfs_with_tables/medium.pdf");
|
|
377
|
+
|
|
378
|
+
if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
|
|
379
|
+
let params = BatchExtractFilesParams {
|
|
380
|
+
paths: vec![file1.to_string(), file2.to_string()],
|
|
381
|
+
enable_ocr: false,
|
|
382
|
+
force_ocr: false,
|
|
383
|
+
r#async: true,
|
|
384
|
+
};
|
|
385
|
+
|
|
386
|
+
let result = server.batch_extract_files(Parameters(params)).await;
|
|
387
|
+
|
|
388
|
+
if let Ok(call_result) = result
|
|
389
|
+
&& let Some(content) = call_result.content.first()
|
|
390
|
+
&& let RawContent::Text(text) = &content.raw
|
|
391
|
+
{
|
|
392
|
+
assert!(text.text.contains("Document 1"));
|
|
393
|
+
assert!(text.text.contains("Document 2"));
|
|
394
|
+
|
|
395
|
+
let doc1_pos = text.text.find("Document 1");
|
|
396
|
+
let doc2_pos = text.text.find("Document 2");
|
|
397
|
+
if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
|
|
398
|
+
assert!(pos1 < pos2, "Documents should be in order");
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|