kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
//! use kreuzberg::mcp::start_mcp_server;
|
|
17
17
|
//!
|
|
18
18
|
//! #[tokio::main]
|
|
19
|
-
//! async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
19
|
+
//! async fn main() -> anyhow::Result<()> {
|
|
20
20
|
//! start_mcp_server().await?;
|
|
21
21
|
//! Ok(())
|
|
22
22
|
//! }
|
|
@@ -26,9 +26,6 @@ mod server;
|
|
|
26
26
|
|
|
27
27
|
pub use server::{start_mcp_server, start_mcp_server_with_config};
|
|
28
28
|
|
|
29
|
-
#[cfg(feature = "mcp-http")]
|
|
30
|
-
pub use server::{start_mcp_server_http, start_mcp_server_http_with_config};
|
|
31
|
-
|
|
32
29
|
pub use server::{BatchExtractFilesParams, DetectMimeTypeParams, ExtractBytesParams, ExtractFileParams, KreuzbergMcp};
|
|
33
30
|
|
|
34
31
|
#[doc(hidden)]
|
|
@@ -12,9 +12,6 @@ use rmcp::{
|
|
|
12
12
|
transport::stdio,
|
|
13
13
|
};
|
|
14
14
|
|
|
15
|
-
#[cfg(feature = "mcp-http")]
|
|
16
|
-
use rmcp::transport::streamable_http_server::{StreamableHttpService, session::local::LocalSessionManager};
|
|
17
|
-
|
|
18
15
|
use crate::{
|
|
19
16
|
ExtractionConfig, ExtractionResult as KreuzbergResult, KreuzbergError, batch_extract_file, batch_extract_file_sync,
|
|
20
17
|
cache, detect_mime_type, extract_bytes, extract_bytes_sync, extract_file, extract_file_sync,
|
|
@@ -229,8 +226,7 @@ impl KreuzbergMcp {
|
|
|
229
226
|
/// This tool extracts text, metadata, and tables from documents in various formats
|
|
230
227
|
/// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
|
|
231
228
|
#[tool(
|
|
232
|
-
description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more."
|
|
233
|
-
annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
|
|
229
|
+
description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more."
|
|
234
230
|
)]
|
|
235
231
|
async fn extract_file(
|
|
236
232
|
&self,
|
|
@@ -254,8 +250,7 @@ impl KreuzbergMcp {
|
|
|
254
250
|
///
|
|
255
251
|
/// This tool extracts text, metadata, and tables from base64-encoded document data.
|
|
256
252
|
#[tool(
|
|
257
|
-
description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables."
|
|
258
|
-
annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
|
|
253
|
+
description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables."
|
|
259
254
|
)]
|
|
260
255
|
async fn extract_bytes(
|
|
261
256
|
&self,
|
|
@@ -284,10 +279,7 @@ impl KreuzbergMcp {
|
|
|
284
279
|
/// Extract content from multiple files in parallel.
|
|
285
280
|
///
|
|
286
281
|
/// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
|
|
287
|
-
#[tool(
|
|
288
|
-
description = "Extract content from multiple files in parallel. Returns results for all files.",
|
|
289
|
-
annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
|
|
290
|
-
)]
|
|
282
|
+
#[tool(description = "Extract content from multiple files in parallel. Returns results for all files.")]
|
|
291
283
|
async fn batch_extract_files(
|
|
292
284
|
&self,
|
|
293
285
|
Parameters(params): Parameters<BatchExtractFilesParams>,
|
|
@@ -315,10 +307,7 @@ impl KreuzbergMcp {
|
|
|
315
307
|
/// Detect the MIME type of a file.
|
|
316
308
|
///
|
|
317
309
|
/// This tool identifies the file format, useful for determining which extractor to use.
|
|
318
|
-
#[tool(
|
|
319
|
-
description = "Detect the MIME type of a file. Returns the detected MIME type string.",
|
|
320
|
-
annotations(title = "Detect MIME Type", read_only_hint = true, idempotent_hint = true)
|
|
321
|
-
)]
|
|
310
|
+
#[tool(description = "Detect the MIME type of a file. Returns the detected MIME type string.")]
|
|
322
311
|
fn detect_mime_type(
|
|
323
312
|
&self,
|
|
324
313
|
Parameters(params): Parameters<DetectMimeTypeParams>,
|
|
@@ -331,10 +320,7 @@ impl KreuzbergMcp {
|
|
|
331
320
|
/// Get cache statistics.
|
|
332
321
|
///
|
|
333
322
|
/// This tool returns statistics about the cache including total files, size, and disk space.
|
|
334
|
-
#[tool(
|
|
335
|
-
description = "Get cache statistics including total files, size, and available disk space.",
|
|
336
|
-
annotations(title = "Cache Stats", read_only_hint = true, idempotent_hint = true)
|
|
337
|
-
)]
|
|
323
|
+
#[tool(description = "Get cache statistics including total files, size, and available disk space.")]
|
|
338
324
|
fn cache_stats(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
|
|
339
325
|
let cache_dir = std::env::current_dir()
|
|
340
326
|
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
|
@@ -365,10 +351,7 @@ impl KreuzbergMcp {
|
|
|
365
351
|
/// Clear the cache.
|
|
366
352
|
///
|
|
367
353
|
/// This tool removes all cached files and returns the number of files removed and space freed.
|
|
368
|
-
#[tool(
|
|
369
|
-
description = "Clear all cached files. Returns the number of files removed and space freed in MB.",
|
|
370
|
-
annotations(title = "Clear Cache", destructive_hint = true)
|
|
371
|
-
)]
|
|
354
|
+
#[tool(description = "Clear all cached files. Returns the number of files removed and space freed in MB.")]
|
|
372
355
|
fn cache_clear(&self, Parameters(_): Parameters<()>) -> Result<CallToolResult, McpError> {
|
|
373
356
|
let cache_dir = std::env::current_dir()
|
|
374
357
|
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
|
@@ -445,12 +428,12 @@ impl Default for KreuzbergMcp {
|
|
|
445
428
|
/// use kreuzberg::mcp::start_mcp_server;
|
|
446
429
|
///
|
|
447
430
|
/// #[tokio::main]
|
|
448
|
-
/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
431
|
+
/// async fn main() -> anyhow::Result<()> {
|
|
449
432
|
/// start_mcp_server().await?;
|
|
450
433
|
/// Ok(())
|
|
451
434
|
/// }
|
|
452
435
|
/// ```
|
|
453
|
-
pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
436
|
+
pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error>> {
|
|
454
437
|
let service = KreuzbergMcp::new()?.serve(stdio()).await?;
|
|
455
438
|
|
|
456
439
|
service.waiting().await?;
|
|
@@ -461,118 +444,13 @@ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send +
|
|
|
461
444
|
///
|
|
462
445
|
/// This variant allows specifying a custom extraction configuration
|
|
463
446
|
/// (e.g., loaded from a file) instead of using defaults.
|
|
464
|
-
pub async fn start_mcp_server_with_config(
|
|
465
|
-
config: ExtractionConfig,
|
|
466
|
-
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
447
|
+
pub async fn start_mcp_server_with_config(config: ExtractionConfig) -> Result<(), Box<dyn std::error::Error>> {
|
|
467
448
|
let service = KreuzbergMcp::with_config(config).serve(stdio()).await?;
|
|
468
449
|
|
|
469
450
|
service.waiting().await?;
|
|
470
451
|
Ok(())
|
|
471
452
|
}
|
|
472
453
|
|
|
473
|
-
/// Start MCP server with HTTP Stream transport.
|
|
474
|
-
///
|
|
475
|
-
/// Uses rmcp's built-in StreamableHttpService for HTTP/SSE support per MCP spec.
|
|
476
|
-
///
|
|
477
|
-
/// # Arguments
|
|
478
|
-
///
|
|
479
|
-
/// * `host` - Host to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
480
|
-
/// * `port` - Port number (e.g., 8001)
|
|
481
|
-
///
|
|
482
|
-
/// # Example
|
|
483
|
-
///
|
|
484
|
-
/// ```no_run
|
|
485
|
-
/// use kreuzberg::mcp::start_mcp_server_http;
|
|
486
|
-
///
|
|
487
|
-
/// #[tokio::main]
|
|
488
|
-
/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
489
|
-
/// start_mcp_server_http("127.0.0.1", 8001).await?;
|
|
490
|
-
/// Ok(())
|
|
491
|
-
/// }
|
|
492
|
-
/// ```
|
|
493
|
-
#[cfg(feature = "mcp-http")]
|
|
494
|
-
pub async fn start_mcp_server_http(
|
|
495
|
-
host: impl AsRef<str>,
|
|
496
|
-
port: u16,
|
|
497
|
-
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
498
|
-
use axum::Router;
|
|
499
|
-
use std::net::SocketAddr;
|
|
500
|
-
|
|
501
|
-
let http_service = StreamableHttpService::new(
|
|
502
|
-
|| KreuzbergMcp::new().map_err(|e| std::io::Error::other(e.to_string())),
|
|
503
|
-
LocalSessionManager::default().into(),
|
|
504
|
-
Default::default(),
|
|
505
|
-
);
|
|
506
|
-
|
|
507
|
-
let router = Router::new().nest_service("/mcp", http_service);
|
|
508
|
-
|
|
509
|
-
let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
|
|
510
|
-
.parse()
|
|
511
|
-
.map_err(|e| format!("Invalid address: {}", e))?;
|
|
512
|
-
|
|
513
|
-
#[cfg(feature = "api")]
|
|
514
|
-
tracing::info!("Starting MCP HTTP server on http://{}", addr);
|
|
515
|
-
|
|
516
|
-
let listener = tokio::net::TcpListener::bind(addr).await?;
|
|
517
|
-
axum::serve(listener, router).await?;
|
|
518
|
-
|
|
519
|
-
Ok(())
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
/// Start MCP HTTP server with custom extraction config.
|
|
523
|
-
///
|
|
524
|
-
/// This variant allows specifying a custom extraction configuration
|
|
525
|
-
/// while using HTTP Stream transport.
|
|
526
|
-
///
|
|
527
|
-
/// # Arguments
|
|
528
|
-
///
|
|
529
|
-
/// * `host` - Host to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
530
|
-
/// * `port` - Port number (e.g., 8001)
|
|
531
|
-
/// * `config` - Custom extraction configuration
|
|
532
|
-
///
|
|
533
|
-
/// # Example
|
|
534
|
-
///
|
|
535
|
-
/// ```no_run
|
|
536
|
-
/// use kreuzberg::mcp::start_mcp_server_http_with_config;
|
|
537
|
-
/// use kreuzberg::ExtractionConfig;
|
|
538
|
-
///
|
|
539
|
-
/// #[tokio::main]
|
|
540
|
-
/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
541
|
-
/// let config = ExtractionConfig::default();
|
|
542
|
-
/// start_mcp_server_http_with_config("127.0.0.1", 8001, config).await?;
|
|
543
|
-
/// Ok(())
|
|
544
|
-
/// }
|
|
545
|
-
/// ```
|
|
546
|
-
#[cfg(feature = "mcp-http")]
|
|
547
|
-
pub async fn start_mcp_server_http_with_config(
|
|
548
|
-
host: impl AsRef<str>,
|
|
549
|
-
port: u16,
|
|
550
|
-
config: ExtractionConfig,
|
|
551
|
-
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|
552
|
-
use axum::Router;
|
|
553
|
-
use std::net::SocketAddr;
|
|
554
|
-
|
|
555
|
-
let http_service = StreamableHttpService::new(
|
|
556
|
-
move || Ok(KreuzbergMcp::with_config(config.clone())),
|
|
557
|
-
LocalSessionManager::default().into(),
|
|
558
|
-
Default::default(),
|
|
559
|
-
);
|
|
560
|
-
|
|
561
|
-
let router = Router::new().nest_service("/mcp", http_service);
|
|
562
|
-
|
|
563
|
-
let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
|
|
564
|
-
.parse()
|
|
565
|
-
.map_err(|e| format!("Invalid address: {}", e))?;
|
|
566
|
-
|
|
567
|
-
#[cfg(feature = "api")]
|
|
568
|
-
tracing::info!("Starting MCP HTTP server on http://{}", addr);
|
|
569
|
-
|
|
570
|
-
let listener = tokio::net::TcpListener::bind(addr).await?;
|
|
571
|
-
axum::serve(listener, router).await?;
|
|
572
|
-
|
|
573
|
-
Ok(())
|
|
574
|
-
}
|
|
575
|
-
|
|
576
454
|
/// Build extraction config from MCP parameters.
|
|
577
455
|
///
|
|
578
456
|
/// Starts with the default config and overlays OCR settings from request parameters.
|
|
@@ -870,7 +748,6 @@ mod tests {
|
|
|
870
748
|
detected_languages: None,
|
|
871
749
|
chunks: None,
|
|
872
750
|
images: None,
|
|
873
|
-
pages: None,
|
|
874
751
|
};
|
|
875
752
|
|
|
876
753
|
let formatted = format_extraction_result(&result);
|
|
@@ -907,7 +784,6 @@ mod tests {
|
|
|
907
784
|
detected_languages: None,
|
|
908
785
|
chunks: None,
|
|
909
786
|
images: None,
|
|
910
|
-
pages: None,
|
|
911
787
|
};
|
|
912
788
|
|
|
913
789
|
let formatted = format_extraction_result(&result);
|
|
@@ -929,7 +805,6 @@ mod tests {
|
|
|
929
805
|
detected_languages: None,
|
|
930
806
|
chunks: None,
|
|
931
807
|
images: None,
|
|
932
|
-
pages: None,
|
|
933
808
|
};
|
|
934
809
|
|
|
935
810
|
let formatted = format_extraction_result(&result);
|
|
@@ -948,7 +823,6 @@ mod tests {
|
|
|
948
823
|
detected_languages: None,
|
|
949
824
|
chunks: None,
|
|
950
825
|
images: None,
|
|
951
|
-
pages: None,
|
|
952
826
|
};
|
|
953
827
|
|
|
954
828
|
let formatted = format_extraction_result(&result);
|
|
@@ -1746,17 +1620,19 @@ mod tests {
|
|
|
1746
1620
|
|
|
1747
1621
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
1748
1622
|
|
|
1749
|
-
if
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1623
|
+
if result.is_ok() {
|
|
1624
|
+
let call_result = result.unwrap();
|
|
1625
|
+
if let Some(content) = call_result.content.first()
|
|
1626
|
+
&& let RawContent::Text(text) = &content.raw
|
|
1627
|
+
{
|
|
1628
|
+
assert!(text.text.contains("Document 1"));
|
|
1629
|
+
assert!(text.text.contains("Document 2"));
|
|
1755
1630
|
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1631
|
+
let doc1_pos = text.text.find("Document 1");
|
|
1632
|
+
let doc2_pos = text.text.find("Document 2");
|
|
1633
|
+
if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
|
|
1634
|
+
assert!(pos1 < pos2, "Documents should be in order");
|
|
1635
|
+
}
|
|
1760
1636
|
}
|
|
1761
1637
|
}
|
|
1762
1638
|
}
|
|
@@ -40,7 +40,6 @@
|
|
|
40
40
|
pub mod cache;
|
|
41
41
|
pub mod error;
|
|
42
42
|
pub mod hocr;
|
|
43
|
-
pub mod language_registry;
|
|
44
43
|
pub mod processor;
|
|
45
44
|
pub mod table;
|
|
46
45
|
pub mod tesseract_backend;
|
|
@@ -51,7 +50,6 @@ pub mod validation;
|
|
|
51
50
|
pub use cache::{OcrCache, OcrCacheStats};
|
|
52
51
|
pub use error::OcrError;
|
|
53
52
|
pub use hocr::convert_hocr_to_markdown;
|
|
54
|
-
pub use language_registry::LanguageRegistry;
|
|
55
53
|
pub use processor::OcrProcessor;
|
|
56
54
|
pub use table::{HocrWord, extract_words_from_tsv, reconstruct_table, table_to_markdown};
|
|
57
55
|
pub use tesseract_backend::TesseractBackend;
|
|
@@ -51,14 +51,6 @@ impl OcrProcessor {
|
|
|
51
51
|
Ok(Self { cache })
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
55
|
-
skip(self, image_bytes),
|
|
56
|
-
fields(
|
|
57
|
-
ocr.backend = "tesseract",
|
|
58
|
-
ocr.language = %config.language,
|
|
59
|
-
image.size_bytes = image_bytes.len(),
|
|
60
|
-
)
|
|
61
|
-
))]
|
|
62
54
|
pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
|
|
63
55
|
config.validate().map_err(OcrError::InvalidConfiguration)?;
|
|
64
56
|
|
|
@@ -72,14 +64,9 @@ impl OcrProcessor {
|
|
|
72
64
|
if config.use_cache
|
|
73
65
|
&& let Some(cached_result) = self.cache.get_cached_result(&image_hash, "tesseract", &config_str)?
|
|
74
66
|
{
|
|
75
|
-
#[cfg(feature = "otel")]
|
|
76
|
-
tracing::Span::current().record("cache.hit", true);
|
|
77
67
|
return Ok(cached_result);
|
|
78
68
|
}
|
|
79
69
|
|
|
80
|
-
#[cfg(feature = "otel")]
|
|
81
|
-
tracing::Span::current().record("cache.hit", false);
|
|
82
|
-
|
|
83
70
|
let result = self.perform_ocr(image_bytes, config)?;
|
|
84
71
|
|
|
85
72
|
if config.use_cache {
|
|
@@ -241,6 +228,7 @@ impl OcrProcessor {
|
|
|
241
228
|
});
|
|
242
229
|
|
|
243
230
|
// Validate language before initializing to prevent segfault ~keep
|
|
231
|
+
// tesseract-rs can crash on empty language or missing language files
|
|
244
232
|
if config.language.trim().is_empty() {
|
|
245
233
|
return Err(OcrError::TesseractInitializationFailed(
|
|
246
234
|
"Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
|
|
@@ -248,6 +236,7 @@ impl OcrProcessor {
|
|
|
248
236
|
}
|
|
249
237
|
|
|
250
238
|
// Validate language file exists before initializing to prevent segfault ~keep
|
|
239
|
+
// tesseract-rs can crash if language file is missing instead of returning error
|
|
251
240
|
if !tessdata_path.is_empty() {
|
|
252
241
|
let languages: Vec<&str> = config.language.split('+').collect();
|
|
253
242
|
for lang in languages {
|
|
@@ -373,11 +362,6 @@ impl OcrProcessor {
|
|
|
373
362
|
)
|
|
374
363
|
});
|
|
375
364
|
|
|
376
|
-
api.recognize()
|
|
377
|
-
.map_err(|e| OcrError::ProcessingFailed(format!("Failed to recognize text: {}", e)))?;
|
|
378
|
-
|
|
379
|
-
log_ci_debug(ci_debug_enabled, "recognize", || "completed".to_string());
|
|
380
|
-
|
|
381
365
|
let tsv_data_for_tables = if config.enable_table_detection || config.output_format == "tsv" {
|
|
382
366
|
Some(
|
|
383
367
|
api.get_tsv_text(0)
|
|
@@ -453,7 +437,12 @@ impl OcrProcessor {
|
|
|
453
437
|
let words = extract_words_from_tsv(&tsv_data, config.table_min_confidence)?;
|
|
454
438
|
|
|
455
439
|
if !words.is_empty() {
|
|
456
|
-
let table = reconstruct_table(
|
|
440
|
+
let table = reconstruct_table(
|
|
441
|
+
&words,
|
|
442
|
+
config.table_column_threshold,
|
|
443
|
+
config.table_row_threshold_ratio,
|
|
444
|
+
true,
|
|
445
|
+
);
|
|
457
446
|
if !table.is_empty() {
|
|
458
447
|
metadata.insert("table_count".to_string(), serde_json::Value::String("1".to_string()));
|
|
459
448
|
metadata.insert(
|
|
@@ -161,7 +161,6 @@ impl OcrBackend for TesseractBackend {
|
|
|
161
161
|
content: ocr_result.content,
|
|
162
162
|
mime_type: ocr_result.mime_type,
|
|
163
163
|
metadata,
|
|
164
|
-
pages: None,
|
|
165
164
|
tables: ocr_result
|
|
166
165
|
.tables
|
|
167
166
|
.into_iter()
|
|
@@ -215,7 +214,6 @@ impl OcrBackend for TesseractBackend {
|
|
|
215
214
|
content: ocr_result.content,
|
|
216
215
|
mime_type: ocr_result.mime_type,
|
|
217
216
|
metadata,
|
|
218
|
-
pages: None,
|
|
219
217
|
tables: ocr_result
|
|
220
218
|
.tables
|
|
221
219
|
.into_iter()
|
|
@@ -10,8 +10,6 @@ pub enum PdfError {
|
|
|
10
10
|
TextExtractionFailed(String),
|
|
11
11
|
RenderingFailed(String),
|
|
12
12
|
MetadataExtractionFailed(String),
|
|
13
|
-
ExtractionFailed(String),
|
|
14
|
-
FontLoadingFailed(String),
|
|
15
13
|
IOError(String),
|
|
16
14
|
}
|
|
17
15
|
|
|
@@ -30,8 +28,6 @@ impl fmt::Display for PdfError {
|
|
|
30
28
|
PdfError::MetadataExtractionFailed(msg) => {
|
|
31
29
|
write!(f, "Metadata extraction failed: {}", msg)
|
|
32
30
|
}
|
|
33
|
-
PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
|
|
34
|
-
PdfError::FontLoadingFailed(msg) => write!(f, "Font loading failed: {}", msg),
|
|
35
31
|
PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
|
|
36
32
|
}
|
|
37
33
|
}
|
|
@@ -44,7 +40,7 @@ impl std::error::Error for PdfError {}
|
|
|
44
40
|
impl From<lopdf::Error> for PdfError {
|
|
45
41
|
fn from(err: lopdf::Error) -> Self {
|
|
46
42
|
match err {
|
|
47
|
-
lopdf::Error::IO(
|
|
43
|
+
lopdf::Error::IO(_) => panic!("lopdf IO errors should not be converted to PdfError - let them bubble up"),
|
|
48
44
|
_ => PdfError::InvalidPdf(err.to_string()),
|
|
49
45
|
}
|
|
50
46
|
}
|
|
@@ -52,30 +48,6 @@ impl From<lopdf::Error> for PdfError {
|
|
|
52
48
|
|
|
53
49
|
pub type Result<T> = std::result::Result<T, PdfError>;
|
|
54
50
|
|
|
55
|
-
/// Format a pdfium error for display.
|
|
56
|
-
///
|
|
57
|
-
/// The kreuzberg-pdfium-render fork's error type doesn't implement Display,
|
|
58
|
-
/// so Debug formatting produces messages like "PdfiumLibraryInternalError(FormatError,)"
|
|
59
|
-
/// with trailing commas and parentheses. This function cleans up the formatting.
|
|
60
|
-
pub(crate) fn format_pdfium_error<E: std::fmt::Debug>(error: E) -> String {
|
|
61
|
-
let debug_msg = format!("{:?}", error);
|
|
62
|
-
|
|
63
|
-
if let Some(paren_idx) = debug_msg.find('(') {
|
|
64
|
-
let variant = &debug_msg[..paren_idx];
|
|
65
|
-
let inner = &debug_msg[paren_idx + 1..];
|
|
66
|
-
|
|
67
|
-
let inner_clean = inner.trim_end_matches(')').trim_end_matches(',');
|
|
68
|
-
|
|
69
|
-
if inner_clean.is_empty() {
|
|
70
|
-
variant.to_string()
|
|
71
|
-
} else {
|
|
72
|
-
format!("{}: {}", variant, inner_clean)
|
|
73
|
-
}
|
|
74
|
-
} else {
|
|
75
|
-
debug_msg
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
51
|
#[cfg(test)]
|
|
80
52
|
mod tests {
|
|
81
53
|
use super::*;
|
|
@@ -147,68 +119,4 @@ mod tests {
|
|
|
147
119
|
let err2 = err1.clone();
|
|
148
120
|
assert_eq!(err1.to_string(), err2.to_string());
|
|
149
121
|
}
|
|
150
|
-
|
|
151
|
-
#[test]
|
|
152
|
-
fn test_extraction_failed_error() {
|
|
153
|
-
let err = PdfError::ExtractionFailed("page data mismatch".to_string());
|
|
154
|
-
assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
#[test]
|
|
158
|
-
fn test_font_loading_failed_error() {
|
|
159
|
-
let err = PdfError::FontLoadingFailed("missing font file".to_string());
|
|
160
|
-
assert_eq!(err.to_string(), "Font loading failed: missing font file");
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
#[test]
|
|
164
|
-
fn test_format_pdfium_error_with_inner_value() {
|
|
165
|
-
#[derive(Debug)]
|
|
166
|
-
#[allow(dead_code)]
|
|
167
|
-
struct MockError(String);
|
|
168
|
-
|
|
169
|
-
let error = MockError("FormatError,".to_string());
|
|
170
|
-
let formatted = format_pdfium_error(error);
|
|
171
|
-
assert!(formatted.contains("MockError"));
|
|
172
|
-
assert!(formatted.contains("FormatError"));
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
#[test]
|
|
176
|
-
fn test_format_pdfium_error_simple() {
|
|
177
|
-
#[derive(Debug)]
|
|
178
|
-
struct SimpleError;
|
|
179
|
-
|
|
180
|
-
let formatted = format_pdfium_error(SimpleError);
|
|
181
|
-
assert_eq!(formatted, "SimpleError");
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
#[test]
|
|
185
|
-
fn test_format_pdfium_error_empty_inner() {
|
|
186
|
-
#[derive(Debug)]
|
|
187
|
-
struct EmptyInner;
|
|
188
|
-
|
|
189
|
-
let formatted = format_pdfium_error(EmptyInner);
|
|
190
|
-
assert_eq!(formatted, "EmptyInner");
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
#[test]
|
|
194
|
-
fn test_format_pdfium_error_cleans_trailing_comma() {
|
|
195
|
-
#[derive(Debug)]
|
|
196
|
-
#[allow(dead_code)]
|
|
197
|
-
enum PdfiumError {
|
|
198
|
-
PdfiumLibraryInternalError(InternalError),
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
#[derive(Debug)]
|
|
202
|
-
#[allow(dead_code)]
|
|
203
|
-
enum InternalError {
|
|
204
|
-
FormatError,
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
let error = PdfiumError::PdfiumLibraryInternalError(InternalError::FormatError);
|
|
208
|
-
let formatted = format_pdfium_error(error);
|
|
209
|
-
|
|
210
|
-
assert!(!formatted.contains(",)"));
|
|
211
|
-
assert!(formatted.contains("PdfiumLibraryInternalError"));
|
|
212
|
-
assert!(formatted.contains("FormatError"));
|
|
213
|
-
}
|
|
214
122
|
}
|