kreuzberg 4.2.8 → 4.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +5 -3
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +168 -5
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +27 -0
- data/vendor/kreuzberg/src/mcp/params.rs +0 -16
- data/vendor/kreuzberg/src/mcp/server.rs +29 -24
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +21 -43
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1bdd32141526f545868c567acbc8e3a7caf94b4ff7e42bebf859fe33416669e4
|
|
4
|
+
data.tar.gz: 10da5a6da3a781b9676ba1213a535a69edde90b89ccad45489fab9fb593f5f73
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e45428f1c646ed0683f51fa932c2432b0563d3258912fbe7b49f75acf0cdbc43c844c92b17cf7d4a5ddccb0b010d23cce4b20de950877fbe64ecafb858312bc5
|
|
7
|
+
data.tar.gz: f0abcd49fe46a4f0e3e2bf80e217ff36970b4a6037ecec6ea889230605a83178d76bff31d0960d50fb2ad4e1ea6f703c595bd43c244ff0e082ab365eb86bf02a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.8)
|
|
4
|
+
kreuzberg (4.2.9)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
212
|
-
kreuzberg (4.2.8)
|
|
212
|
+
kreuzberg (4.2.9)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.8" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.8 Release**
|
|
20
|
+
> **🚀 Version 4.2.9 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::Result;
|
|
6
6
|
use crate::core::config::ExtractionConfig;
|
|
7
|
-
use crate::types::PageContent;
|
|
7
|
+
use crate::types::{PageBoundary, PageContent};
|
|
8
8
|
|
|
9
9
|
#[cfg(feature = "pdf")]
|
|
10
10
|
use crate::types::Table;
|
|
@@ -17,6 +17,7 @@ pub(crate) type PdfExtractionPhaseResult = (
|
|
|
17
17
|
String,
|
|
18
18
|
Vec<Table>,
|
|
19
19
|
Option<Vec<PageContent>>,
|
|
20
|
+
Option<Vec<PageBoundary>>,
|
|
20
21
|
);
|
|
21
22
|
|
|
22
23
|
/// Extract text, metadata, and tables from a PDF document using a single shared instance.
|
|
@@ -41,17 +42,18 @@ pub(crate) type PdfExtractionPhaseResult = (
|
|
|
41
42
|
/// - Native extracted text (or empty if using OCR)
|
|
42
43
|
/// - Extracted tables (if OCR feature enabled)
|
|
43
44
|
/// - Per-page content (if page extraction configured)
|
|
45
|
+
/// - Page boundaries for per-page OCR evaluation
|
|
44
46
|
#[cfg(feature = "pdf")]
|
|
45
47
|
pub(crate) fn extract_all_from_document(
|
|
46
48
|
document: &PdfDocument,
|
|
47
49
|
config: &ExtractionConfig,
|
|
48
50
|
) -> Result<PdfExtractionPhaseResult> {
|
|
49
|
-
let (native_text,
|
|
51
|
+
let (native_text, boundaries, page_contents, pdf_metadata) =
|
|
50
52
|
crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
|
|
51
53
|
|
|
52
54
|
let tables = extract_tables_from_document(document, &pdf_metadata)?;
|
|
53
55
|
|
|
54
|
-
Ok((pdf_metadata, native_text, tables, page_contents))
|
|
56
|
+
Ok((pdf_metadata, native_text, tables, page_contents, boundaries))
|
|
55
57
|
}
|
|
56
58
|
|
|
57
59
|
/// Extract tables from PDF document using native text positions.
|
|
@@ -22,7 +22,7 @@ use crate::pdf::error::PdfError;
|
|
|
22
22
|
|
|
23
23
|
// Re-export for backward compatibility
|
|
24
24
|
#[cfg(feature = "ocr")]
|
|
25
|
-
pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr};
|
|
25
|
+
pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr, evaluate_per_page_ocr};
|
|
26
26
|
|
|
27
27
|
use extraction::extract_all_from_document;
|
|
28
28
|
#[cfg(feature = "ocr")]
|
|
@@ -78,7 +78,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
78
78
|
config: &ExtractionConfig,
|
|
79
79
|
) -> Result<ExtractionResult> {
|
|
80
80
|
#[cfg(feature = "pdf")]
|
|
81
|
-
let (pdf_metadata, native_text, tables, page_contents) = {
|
|
81
|
+
let (pdf_metadata, native_text, tables, page_contents, _boundaries) = {
|
|
82
82
|
#[cfg(target_arch = "wasm32")]
|
|
83
83
|
{
|
|
84
84
|
let pdfium = crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
|
|
@@ -128,7 +128,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
128
128
|
}
|
|
129
129
|
})?;
|
|
130
130
|
|
|
131
|
-
let (pdf_metadata, native_text, tables, page_contents) =
|
|
131
|
+
let (pdf_metadata, native_text, tables, page_contents, _boundaries) =
|
|
132
132
|
extract_all_from_document(&document, &config_owned)?;
|
|
133
133
|
|
|
134
134
|
if let Some(page_cfg) = config_owned.pages.as_ref()
|
|
@@ -142,7 +142,13 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
142
142
|
.into());
|
|
143
143
|
}
|
|
144
144
|
|
|
145
|
-
Ok::<_, crate::error::KreuzbergError>((
|
|
145
|
+
Ok::<_, crate::error::KreuzbergError>((
|
|
146
|
+
pdf_metadata,
|
|
147
|
+
native_text,
|
|
148
|
+
tables,
|
|
149
|
+
page_contents,
|
|
150
|
+
_boundaries,
|
|
151
|
+
))
|
|
146
152
|
})
|
|
147
153
|
.await
|
|
148
154
|
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
@@ -188,7 +194,11 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
188
194
|
native_text
|
|
189
195
|
}
|
|
190
196
|
} else if config.ocr.is_some() {
|
|
191
|
-
let decision = ocr::
|
|
197
|
+
let decision = ocr::evaluate_per_page_ocr(
|
|
198
|
+
&native_text,
|
|
199
|
+
_boundaries.as_deref(),
|
|
200
|
+
pdf_metadata.pdf_specific.page_count,
|
|
201
|
+
);
|
|
192
202
|
|
|
193
203
|
if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
|
|
194
204
|
eprintln!(
|
|
@@ -365,6 +375,159 @@ mod tests {
|
|
|
365
375
|
assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2)).fallback);
|
|
366
376
|
}
|
|
367
377
|
|
|
378
|
+
#[cfg(feature = "ocr")]
|
|
379
|
+
#[test]
|
|
380
|
+
fn test_per_page_ocr_no_boundaries_falls_back_to_whole_doc() {
|
|
381
|
+
let text = "This document has enough meaningful words for evaluation purposes here.";
|
|
382
|
+
let decision = ocr::evaluate_per_page_ocr(text, None, Some(1));
|
|
383
|
+
assert!(!decision.fallback);
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
#[cfg(feature = "ocr")]
|
|
387
|
+
#[test]
|
|
388
|
+
fn test_per_page_ocr_empty_boundaries_falls_back_to_whole_doc() {
|
|
389
|
+
let text = "This document has enough meaningful words for evaluation purposes here.";
|
|
390
|
+
let decision = ocr::evaluate_per_page_ocr(text, Some(&[]), Some(1));
|
|
391
|
+
assert!(!decision.fallback);
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
#[cfg(feature = "ocr")]
|
|
395
|
+
#[test]
|
|
396
|
+
fn test_per_page_ocr_all_pages_good() {
|
|
397
|
+
use crate::types::PageBoundary;
|
|
398
|
+
|
|
399
|
+
let page1 = "This first page has plenty of meaningful searchable text content here.";
|
|
400
|
+
let page2 = "This second page also has plenty of meaningful searchable text content.";
|
|
401
|
+
let text = format!("{}{}", page1, page2);
|
|
402
|
+
let boundaries = vec![
|
|
403
|
+
PageBoundary {
|
|
404
|
+
byte_start: 0,
|
|
405
|
+
byte_end: page1.len(),
|
|
406
|
+
page_number: 1,
|
|
407
|
+
},
|
|
408
|
+
PageBoundary {
|
|
409
|
+
byte_start: page1.len(),
|
|
410
|
+
byte_end: text.len(),
|
|
411
|
+
page_number: 2,
|
|
412
|
+
},
|
|
413
|
+
];
|
|
414
|
+
|
|
415
|
+
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
|
|
416
|
+
assert!(!decision.fallback);
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
#[cfg(feature = "ocr")]
|
|
420
|
+
#[test]
|
|
421
|
+
fn test_per_page_ocr_one_bad_page_triggers_fallback() {
|
|
422
|
+
use crate::types::PageBoundary;
|
|
423
|
+
|
|
424
|
+
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
|
|
425
|
+
let bad_page = " . ; ";
|
|
426
|
+
let text = format!("{}{}", good_page, bad_page);
|
|
427
|
+
let boundaries = vec![
|
|
428
|
+
PageBoundary {
|
|
429
|
+
byte_start: 0,
|
|
430
|
+
byte_end: good_page.len(),
|
|
431
|
+
page_number: 1,
|
|
432
|
+
},
|
|
433
|
+
PageBoundary {
|
|
434
|
+
byte_start: good_page.len(),
|
|
435
|
+
byte_end: text.len(),
|
|
436
|
+
page_number: 2,
|
|
437
|
+
},
|
|
438
|
+
];
|
|
439
|
+
|
|
440
|
+
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
|
|
441
|
+
assert!(decision.fallback);
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
#[cfg(feature = "ocr")]
|
|
445
|
+
#[test]
|
|
446
|
+
fn test_per_page_ocr_empty_page_triggers_fallback() {
|
|
447
|
+
use crate::types::PageBoundary;
|
|
448
|
+
|
|
449
|
+
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
|
|
450
|
+
let empty_page = "";
|
|
451
|
+
let text = format!("{}{}", good_page, empty_page);
|
|
452
|
+
let boundaries = vec![
|
|
453
|
+
PageBoundary {
|
|
454
|
+
byte_start: 0,
|
|
455
|
+
byte_end: good_page.len(),
|
|
456
|
+
page_number: 1,
|
|
457
|
+
},
|
|
458
|
+
PageBoundary {
|
|
459
|
+
byte_start: good_page.len(),
|
|
460
|
+
byte_end: text.len(),
|
|
461
|
+
page_number: 2,
|
|
462
|
+
},
|
|
463
|
+
];
|
|
464
|
+
|
|
465
|
+
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
|
|
466
|
+
assert!(decision.fallback);
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
#[cfg(feature = "ocr")]
|
|
470
|
+
#[test]
|
|
471
|
+
fn test_per_page_ocr_preserves_document_stats_on_fallback() {
|
|
472
|
+
use crate::types::PageBoundary;
|
|
473
|
+
|
|
474
|
+
let good_page = "This page has plenty of meaningful searchable text content for extraction.";
|
|
475
|
+
let bad_page = " . ; ";
|
|
476
|
+
let text = format!("{}{}", good_page, bad_page);
|
|
477
|
+
let boundaries = vec![
|
|
478
|
+
PageBoundary {
|
|
479
|
+
byte_start: 0,
|
|
480
|
+
byte_end: good_page.len(),
|
|
481
|
+
page_number: 1,
|
|
482
|
+
},
|
|
483
|
+
PageBoundary {
|
|
484
|
+
byte_start: good_page.len(),
|
|
485
|
+
byte_end: text.len(),
|
|
486
|
+
page_number: 2,
|
|
487
|
+
},
|
|
488
|
+
];
|
|
489
|
+
|
|
490
|
+
let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
|
|
491
|
+
assert!(decision.fallback);
|
|
492
|
+
assert!(decision.stats.non_whitespace > 0);
|
|
493
|
+
assert!(decision.stats.meaningful_words > 0);
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
#[cfg(feature = "ocr")]
|
|
497
|
+
#[test]
|
|
498
|
+
fn test_per_page_ocr_invalid_boundaries_skipped() {
|
|
499
|
+
use crate::types::PageBoundary;
|
|
500
|
+
|
|
501
|
+
let text = "This page has plenty of meaningful searchable text content for extraction.";
|
|
502
|
+
let boundaries = vec![
|
|
503
|
+
PageBoundary {
|
|
504
|
+
byte_start: 0,
|
|
505
|
+
byte_end: text.len(),
|
|
506
|
+
page_number: 1,
|
|
507
|
+
},
|
|
508
|
+
PageBoundary {
|
|
509
|
+
byte_start: 999,
|
|
510
|
+
byte_end: 9999,
|
|
511
|
+
page_number: 2,
|
|
512
|
+
},
|
|
513
|
+
];
|
|
514
|
+
|
|
515
|
+
let decision = ocr::evaluate_per_page_ocr(text, Some(&boundaries), Some(1));
|
|
516
|
+
assert!(!decision.fallback);
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
#[cfg(feature = "ocr")]
|
|
520
|
+
#[test]
|
|
521
|
+
fn test_per_page_ocr_multi_page_correct_page_count() {
|
|
522
|
+
let text = "ab cd ef";
|
|
523
|
+
let decision_wrong = ocr::evaluate_native_text_for_ocr(text, None);
|
|
524
|
+
let decision_correct = ocr::evaluate_native_text_for_ocr(text, Some(20));
|
|
525
|
+
assert!(
|
|
526
|
+
decision_correct.avg_non_whitespace < decision_wrong.avg_non_whitespace,
|
|
527
|
+
"Correct page count should produce lower per-page averages"
|
|
528
|
+
);
|
|
529
|
+
}
|
|
530
|
+
|
|
368
531
|
#[tokio::test]
|
|
369
532
|
#[cfg(feature = "pdf")]
|
|
370
533
|
async fn test_pdf_batch_mode_validates_page_config_enabled() {
|
|
@@ -139,6 +139,33 @@ pub fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>
|
|
|
139
139
|
}
|
|
140
140
|
}
|
|
141
141
|
|
|
142
|
+
#[cfg(feature = "ocr")]
|
|
143
|
+
pub fn evaluate_per_page_ocr(
|
|
144
|
+
native_text: &str,
|
|
145
|
+
boundaries: Option<&[crate::types::PageBoundary]>,
|
|
146
|
+
page_count: Option<usize>,
|
|
147
|
+
) -> OcrFallbackDecision {
|
|
148
|
+
let boundaries = match boundaries {
|
|
149
|
+
Some(b) if !b.is_empty() => b,
|
|
150
|
+
_ => return evaluate_native_text_for_ocr(native_text, page_count),
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
let mut document_decision = evaluate_native_text_for_ocr(native_text, page_count);
|
|
154
|
+
|
|
155
|
+
for boundary in boundaries {
|
|
156
|
+
if boundary.byte_end > native_text.len() || boundary.byte_start > boundary.byte_end {
|
|
157
|
+
continue;
|
|
158
|
+
}
|
|
159
|
+
let page_text = &native_text[boundary.byte_start..boundary.byte_end];
|
|
160
|
+
if evaluate_native_text_for_ocr(page_text, Some(1)).fallback {
|
|
161
|
+
document_decision.fallback = true;
|
|
162
|
+
return document_decision;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
document_decision
|
|
167
|
+
}
|
|
168
|
+
|
|
142
169
|
/// Extract text from PDF using OCR.
|
|
143
170
|
///
|
|
144
171
|
/// Renders all pages to images and processes them with OCR backend.
|
|
@@ -15,9 +15,6 @@ pub struct ExtractFileParams {
|
|
|
15
15
|
/// Extraction configuration (JSON object)
|
|
16
16
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
17
17
|
pub config: Option<serde_json::Value>,
|
|
18
|
-
/// Use async extraction (default: false for sync)
|
|
19
|
-
#[serde(default)]
|
|
20
|
-
pub r#async: bool,
|
|
21
18
|
}
|
|
22
19
|
|
|
23
20
|
/// Request parameters for bytes extraction.
|
|
@@ -31,9 +28,6 @@ pub struct ExtractBytesParams {
|
|
|
31
28
|
/// Extraction configuration (JSON object)
|
|
32
29
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
33
30
|
pub config: Option<serde_json::Value>,
|
|
34
|
-
/// Use async extraction (default: false for sync)
|
|
35
|
-
#[serde(default)]
|
|
36
|
-
pub r#async: bool,
|
|
37
31
|
}
|
|
38
32
|
|
|
39
33
|
/// Request parameters for batch file extraction.
|
|
@@ -44,9 +38,6 @@ pub struct BatchExtractFilesParams {
|
|
|
44
38
|
/// Extraction configuration (JSON object)
|
|
45
39
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
46
40
|
pub config: Option<serde_json::Value>,
|
|
47
|
-
/// Use async extraction (default: false for sync)
|
|
48
|
-
#[serde(default)]
|
|
49
|
-
pub r#async: bool,
|
|
50
41
|
}
|
|
51
42
|
|
|
52
43
|
/// Request parameters for MIME type detection.
|
|
@@ -75,7 +66,6 @@ mod tests {
|
|
|
75
66
|
assert_eq!(params.path, "/test.pdf");
|
|
76
67
|
assert_eq!(params.mime_type, None);
|
|
77
68
|
assert_eq!(params.config, None);
|
|
78
|
-
assert!(!params.r#async);
|
|
79
69
|
}
|
|
80
70
|
|
|
81
71
|
#[test]
|
|
@@ -86,7 +76,6 @@ mod tests {
|
|
|
86
76
|
assert_eq!(params.data, "SGVsbG8=");
|
|
87
77
|
assert_eq!(params.mime_type, None);
|
|
88
78
|
assert_eq!(params.config, None);
|
|
89
|
-
assert!(!params.r#async);
|
|
90
79
|
}
|
|
91
80
|
|
|
92
81
|
#[test]
|
|
@@ -96,7 +85,6 @@ mod tests {
|
|
|
96
85
|
|
|
97
86
|
assert_eq!(params.paths.len(), 2);
|
|
98
87
|
assert_eq!(params.config, None);
|
|
99
|
-
assert!(!params.r#async);
|
|
100
88
|
}
|
|
101
89
|
|
|
102
90
|
#[test]
|
|
@@ -131,7 +119,6 @@ mod tests {
|
|
|
131
119
|
path: "/test.pdf".to_string(),
|
|
132
120
|
mime_type: Some("application/pdf".to_string()),
|
|
133
121
|
config: Some(serde_json::json!({"use_cache": false})),
|
|
134
|
-
r#async: true,
|
|
135
122
|
};
|
|
136
123
|
|
|
137
124
|
let json = serde_json::to_string(&params).unwrap();
|
|
@@ -140,7 +127,6 @@ mod tests {
|
|
|
140
127
|
assert_eq!(params.path, deserialized.path);
|
|
141
128
|
assert_eq!(params.mime_type, deserialized.mime_type);
|
|
142
129
|
assert_eq!(params.config, deserialized.config);
|
|
143
|
-
assert_eq!(params.r#async, deserialized.r#async);
|
|
144
130
|
}
|
|
145
131
|
|
|
146
132
|
#[test]
|
|
@@ -149,7 +135,6 @@ mod tests {
|
|
|
149
135
|
data: "SGVsbG8=".to_string(),
|
|
150
136
|
mime_type: None,
|
|
151
137
|
config: None,
|
|
152
|
-
r#async: false,
|
|
153
138
|
};
|
|
154
139
|
|
|
155
140
|
let json = serde_json::to_string(&params).unwrap();
|
|
@@ -163,7 +148,6 @@ mod tests {
|
|
|
163
148
|
let params = BatchExtractFilesParams {
|
|
164
149
|
paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
|
|
165
150
|
config: Some(serde_json::json!({"use_cache": true})),
|
|
166
|
-
r#async: true,
|
|
167
151
|
};
|
|
168
152
|
|
|
169
153
|
let json = serde_json::to_string(&params).unwrap();
|
|
@@ -68,6 +68,10 @@ impl KreuzbergMcp {
|
|
|
68
68
|
///
|
|
69
69
|
/// This tool extracts text, metadata, and tables from documents in various formats
|
|
70
70
|
/// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
|
|
71
|
+
///
|
|
72
|
+
/// Note: The `async` parameter is accepted for API compatibility but ignored.
|
|
73
|
+
/// Extraction always runs asynchronously since the MCP server operates within
|
|
74
|
+
/// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
|
|
71
75
|
#[tool(
|
|
72
76
|
description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
|
|
73
77
|
annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
|
|
@@ -78,18 +82,17 @@ impl KreuzbergMcp {
|
|
|
78
82
|
) -> Result<CallToolResult, rmcp::ErrorData> {
|
|
79
83
|
use super::errors::map_kreuzberg_error_to_mcp;
|
|
80
84
|
use super::format::{build_config, format_extraction_result};
|
|
81
|
-
use crate::
|
|
85
|
+
use crate::extract_file;
|
|
82
86
|
|
|
83
87
|
let config =
|
|
84
88
|
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
85
89
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
};
|
|
90
|
+
// Always use async extraction - we're already in a Tokio runtime context.
|
|
91
|
+
// Calling sync wrappers (which use GLOBAL_RUNTIME.block_on()) from within
|
|
92
|
+
// an async context causes "Cannot start a runtime from within a runtime" panic.
|
|
93
|
+
let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
|
|
94
|
+
.await
|
|
95
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
93
96
|
|
|
94
97
|
let response = format_extraction_result(&result);
|
|
95
98
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -98,6 +101,10 @@ impl KreuzbergMcp {
|
|
|
98
101
|
/// Extract content from base64-encoded bytes.
|
|
99
102
|
///
|
|
100
103
|
/// This tool extracts text, metadata, and tables from base64-encoded document data.
|
|
104
|
+
///
|
|
105
|
+
/// Note: The `async` parameter is accepted for API compatibility but ignored.
|
|
106
|
+
/// Extraction always runs asynchronously since the MCP server operates within
|
|
107
|
+
/// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
|
|
101
108
|
#[tool(
|
|
102
109
|
description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
|
|
103
110
|
annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
|
|
@@ -108,7 +115,7 @@ impl KreuzbergMcp {
|
|
|
108
115
|
) -> Result<CallToolResult, rmcp::ErrorData> {
|
|
109
116
|
use super::errors::map_kreuzberg_error_to_mcp;
|
|
110
117
|
use super::format::{build_config, format_extraction_result};
|
|
111
|
-
use crate::
|
|
118
|
+
use crate::extract_bytes;
|
|
112
119
|
use base64::prelude::*;
|
|
113
120
|
|
|
114
121
|
let bytes = BASE64_STANDARD
|
|
@@ -120,13 +127,10 @@ impl KreuzbergMcp {
|
|
|
120
127
|
|
|
121
128
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
122
129
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
} else {
|
|
128
|
-
extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
129
|
-
};
|
|
130
|
+
// Always use async extraction - we're already in a Tokio runtime context.
|
|
131
|
+
let result = extract_bytes(&bytes, mime_type, &config)
|
|
132
|
+
.await
|
|
133
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
130
134
|
|
|
131
135
|
let response = format_extraction_result(&result);
|
|
132
136
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -135,6 +139,10 @@ impl KreuzbergMcp {
|
|
|
135
139
|
/// Extract content from multiple files in parallel.
|
|
136
140
|
///
|
|
137
141
|
/// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
|
|
142
|
+
///
|
|
143
|
+
/// Note: The `async` parameter is accepted for API compatibility but ignored.
|
|
144
|
+
/// Extraction always runs asynchronously since the MCP server operates within
|
|
145
|
+
/// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
|
|
138
146
|
#[tool(
|
|
139
147
|
description = "Extract content from multiple files in parallel. Returns results for all files.",
|
|
140
148
|
annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
|
|
@@ -145,18 +153,15 @@ impl KreuzbergMcp {
|
|
|
145
153
|
) -> Result<CallToolResult, rmcp::ErrorData> {
|
|
146
154
|
use super::errors::map_kreuzberg_error_to_mcp;
|
|
147
155
|
use super::format::build_config;
|
|
148
|
-
use crate::
|
|
156
|
+
use crate::batch_extract_file;
|
|
149
157
|
|
|
150
158
|
let config =
|
|
151
159
|
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
152
160
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
} else {
|
|
158
|
-
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
159
|
-
};
|
|
161
|
+
// Always use async extraction - we're already in a Tokio runtime context.
|
|
162
|
+
let results = batch_extract_file(params.paths.clone(), &config)
|
|
163
|
+
.await
|
|
164
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
160
165
|
|
|
161
166
|
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
162
167
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
use base64::prelude::*;
|
|
4
4
|
use std::borrow::Cow;
|
|
5
5
|
use crate::{
|
|
6
|
-
ExtractionConfig, batch_extract_file,
|
|
7
|
-
|
|
6
|
+
ExtractionConfig, batch_extract_file, extract_bytes, extract_file,
|
|
7
|
+
mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
|
|
8
8
|
mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
|
|
9
9
|
};
|
|
10
10
|
use rmcp::{
|
|
@@ -34,13 +34,9 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
34
34
|
let config = build_config(self.default_config(), params.config)
|
|
35
35
|
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
36
36
|
|
|
37
|
-
let result =
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
.map_err(map_kreuzberg_error_to_mcp)?
|
|
41
|
-
} else {
|
|
42
|
-
extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
43
|
-
};
|
|
37
|
+
let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
|
|
38
|
+
.await
|
|
39
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
44
40
|
|
|
45
41
|
let response = format_extraction_result(&result);
|
|
46
42
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -66,13 +62,9 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
66
62
|
|
|
67
63
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
68
64
|
|
|
69
|
-
let result =
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
.map_err(map_kreuzberg_error_to_mcp)?
|
|
73
|
-
} else {
|
|
74
|
-
extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
75
|
-
};
|
|
65
|
+
let result = extract_bytes(&bytes, mime_type, &config)
|
|
66
|
+
.await
|
|
67
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
76
68
|
|
|
77
69
|
let response = format_extraction_result(&result);
|
|
78
70
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -92,13 +84,9 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
92
84
|
let config = build_config(self.default_config(), params.config)
|
|
93
85
|
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
94
86
|
|
|
95
|
-
let results =
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
.map_err(map_kreuzberg_error_to_mcp)?
|
|
99
|
-
} else {
|
|
100
|
-
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
101
|
-
};
|
|
87
|
+
let results = batch_extract_file(params.paths.clone(), &config)
|
|
88
|
+
.await
|
|
89
|
+
.map_err(map_kreuzberg_error_to_mcp)?;
|
|
102
90
|
|
|
103
91
|
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
104
92
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
@@ -152,8 +140,7 @@ mod tests {
|
|
|
152
140
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
153
141
|
mime_type: None,
|
|
154
142
|
config: None,
|
|
155
|
-
|
|
156
|
-
};
|
|
143
|
+
};
|
|
157
144
|
|
|
158
145
|
let result = server.extract_file(Parameters(params)).await;
|
|
159
146
|
|
|
@@ -179,8 +166,7 @@ mod tests {
|
|
|
179
166
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
180
167
|
mime_type: None,
|
|
181
168
|
config: None,
|
|
182
|
-
|
|
183
|
-
};
|
|
169
|
+
};
|
|
184
170
|
|
|
185
171
|
let result = server.extract_file(Parameters(params)).await;
|
|
186
172
|
|
|
@@ -205,8 +191,7 @@ mod tests {
|
|
|
205
191
|
path: "/nonexistent/file.pdf".to_string(),
|
|
206
192
|
mime_type: None,
|
|
207
193
|
config: None,
|
|
208
|
-
|
|
209
|
-
};
|
|
194
|
+
};
|
|
210
195
|
|
|
211
196
|
let result = server.extract_file(Parameters(params)).await;
|
|
212
197
|
|
|
@@ -222,8 +207,7 @@ mod tests {
|
|
|
222
207
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
223
208
|
mime_type: Some(Cow::Borrowed("application/pdf")),
|
|
224
209
|
config: None,
|
|
225
|
-
|
|
226
|
-
};
|
|
210
|
+
};
|
|
227
211
|
|
|
228
212
|
let result = server.extract_file(Parameters(params)).await;
|
|
229
213
|
|
|
@@ -241,8 +225,7 @@ mod tests {
|
|
|
241
225
|
data: encoded,
|
|
242
226
|
mime_type: Some(Cow::Borrowed("text/plain")),
|
|
243
227
|
config: None,
|
|
244
|
-
|
|
245
|
-
};
|
|
228
|
+
};
|
|
246
229
|
|
|
247
230
|
let result = server.extract_bytes(Parameters(params)).await;
|
|
248
231
|
|
|
@@ -268,8 +251,7 @@ mod tests {
|
|
|
268
251
|
data: "not-valid-base64!!!".to_string(),
|
|
269
252
|
mime_type: None,
|
|
270
253
|
config: None,
|
|
271
|
-
|
|
272
|
-
};
|
|
254
|
+
};
|
|
273
255
|
|
|
274
256
|
let result = server.extract_bytes(Parameters(params)).await;
|
|
275
257
|
|
|
@@ -285,8 +267,7 @@ mod tests {
|
|
|
285
267
|
let params = BatchExtractFilesParams {
|
|
286
268
|
paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
|
|
287
269
|
config: None,
|
|
288
|
-
|
|
289
|
-
};
|
|
270
|
+
};
|
|
290
271
|
|
|
291
272
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
292
273
|
|
|
@@ -311,8 +292,7 @@ mod tests {
|
|
|
311
292
|
let params = BatchExtractFilesParams {
|
|
312
293
|
paths: vec![],
|
|
313
294
|
config: None,
|
|
314
|
-
|
|
315
|
-
};
|
|
295
|
+
};
|
|
316
296
|
|
|
317
297
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
318
298
|
|
|
@@ -341,8 +321,7 @@ mod tests {
|
|
|
341
321
|
path: test_file.to_string(),
|
|
342
322
|
mime_type: None,
|
|
343
323
|
config: None,
|
|
344
|
-
|
|
345
|
-
};
|
|
324
|
+
};
|
|
346
325
|
|
|
347
326
|
let result = server.extract_file(Parameters(params)).await;
|
|
348
327
|
|
|
@@ -368,8 +347,7 @@ mod tests {
|
|
|
368
347
|
let params = BatchExtractFilesParams {
|
|
369
348
|
paths: vec![file1.to_string(), file2.to_string()],
|
|
370
349
|
config: None,
|
|
371
|
-
|
|
372
|
-
};
|
|
350
|
+
};
|
|
373
351
|
|
|
374
352
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
375
353
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.8
|
|
4
|
+
version: 4.2.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|