kreuzberg 4.2.8 → 4.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a1e253ec8ac69b4394b7f68cd0d9d2050e66885139219a664ee4cff4f7e4c2f
4
- data.tar.gz: f6a6998344418328b89aa4a220a080b2708d0540ceb643a12e815fe5040c088b
3
+ metadata.gz: 1bdd32141526f545868c567acbc8e3a7caf94b4ff7e42bebf859fe33416669e4
4
+ data.tar.gz: 10da5a6da3a781b9676ba1213a535a69edde90b89ccad45489fab9fb593f5f73
5
5
  SHA512:
6
- metadata.gz: c3b982490411f182ee4567e2c692c9cfc9d0f1a670db081d73ea20d707c4e8fb26a12da1e077f405cea1c85f219b3971c00ecee2ffc03bce740cfd0bbe2c65ec
7
- data.tar.gz: c93cf5fb1319aac459129d5266abd02d46731574ddd741edb357ae30f1bb2a42d75eac0e5ff0dd4af8b26dc0fc0da646a1854438c0467ef299d8156f87e5d4c5
6
+ metadata.gz: e45428f1c646ed0683f51fa932c2432b0563d3258912fbe7b49f75acf0cdbc43c844c92b17cf7d4a5ddccb0b010d23cce4b20de950877fbe64ecafb858312bc5
7
+ data.tar.gz: f0abcd49fe46a4f0e3e2bf80e217ff36970b4a6037ecec6ea889230605a83178d76bff31d0960d50fb2ad4e1ea6f703c595bd43c244ff0e082ab365eb86bf02a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.8)
4
+ kreuzberg (4.2.9)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
212
- kreuzberg (4.2.8)
212
+ kreuzberg (4.2.9)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.8" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.8'
4
+ VERSION = '4.2.9'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.8"
6
+ version = "4.2.9"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.8"
3
+ version = "4.2.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.8 Release**
20
+ > **🚀 Version 4.2.9 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -4,7 +4,7 @@
4
4
 
5
5
  use crate::Result;
6
6
  use crate::core::config::ExtractionConfig;
7
- use crate::types::PageContent;
7
+ use crate::types::{PageBoundary, PageContent};
8
8
 
9
9
  #[cfg(feature = "pdf")]
10
10
  use crate::types::Table;
@@ -17,6 +17,7 @@ pub(crate) type PdfExtractionPhaseResult = (
17
17
  String,
18
18
  Vec<Table>,
19
19
  Option<Vec<PageContent>>,
20
+ Option<Vec<PageBoundary>>,
20
21
  );
21
22
 
22
23
  /// Extract text, metadata, and tables from a PDF document using a single shared instance.
@@ -41,17 +42,18 @@ pub(crate) type PdfExtractionPhaseResult = (
41
42
  /// - Native extracted text (or empty if using OCR)
42
43
  /// - Extracted tables (if OCR feature enabled)
43
44
  /// - Per-page content (if page extraction configured)
45
+ /// - Page boundaries for per-page OCR evaluation
44
46
  #[cfg(feature = "pdf")]
45
47
  pub(crate) fn extract_all_from_document(
46
48
  document: &PdfDocument,
47
49
  config: &ExtractionConfig,
48
50
  ) -> Result<PdfExtractionPhaseResult> {
49
- let (native_text, _boundaries, page_contents, pdf_metadata) =
51
+ let (native_text, boundaries, page_contents, pdf_metadata) =
50
52
  crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
51
53
 
52
54
  let tables = extract_tables_from_document(document, &pdf_metadata)?;
53
55
 
54
- Ok((pdf_metadata, native_text, tables, page_contents))
56
+ Ok((pdf_metadata, native_text, tables, page_contents, boundaries))
55
57
  }
56
58
 
57
59
  /// Extract tables from PDF document using native text positions.
@@ -22,7 +22,7 @@ use crate::pdf::error::PdfError;
22
22
 
23
23
  // Re-export for backward compatibility
24
24
  #[cfg(feature = "ocr")]
25
- pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr};
25
+ pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr, evaluate_per_page_ocr};
26
26
 
27
27
  use extraction::extract_all_from_document;
28
28
  #[cfg(feature = "ocr")]
@@ -78,7 +78,7 @@ impl DocumentExtractor for PdfExtractor {
78
78
  config: &ExtractionConfig,
79
79
  ) -> Result<ExtractionResult> {
80
80
  #[cfg(feature = "pdf")]
81
- let (pdf_metadata, native_text, tables, page_contents) = {
81
+ let (pdf_metadata, native_text, tables, page_contents, _boundaries) = {
82
82
  #[cfg(target_arch = "wasm32")]
83
83
  {
84
84
  let pdfium = crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
@@ -128,7 +128,7 @@ impl DocumentExtractor for PdfExtractor {
128
128
  }
129
129
  })?;
130
130
 
131
- let (pdf_metadata, native_text, tables, page_contents) =
131
+ let (pdf_metadata, native_text, tables, page_contents, _boundaries) =
132
132
  extract_all_from_document(&document, &config_owned)?;
133
133
 
134
134
  if let Some(page_cfg) = config_owned.pages.as_ref()
@@ -142,7 +142,13 @@ impl DocumentExtractor for PdfExtractor {
142
142
  .into());
143
143
  }
144
144
 
145
- Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
145
+ Ok::<_, crate::error::KreuzbergError>((
146
+ pdf_metadata,
147
+ native_text,
148
+ tables,
149
+ page_contents,
150
+ _boundaries,
151
+ ))
146
152
  })
147
153
  .await
148
154
  .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
@@ -188,7 +194,11 @@ impl DocumentExtractor for PdfExtractor {
188
194
  native_text
189
195
  }
190
196
  } else if config.ocr.is_some() {
191
- let decision = ocr::evaluate_native_text_for_ocr(&native_text, None);
197
+ let decision = ocr::evaluate_per_page_ocr(
198
+ &native_text,
199
+ _boundaries.as_deref(),
200
+ pdf_metadata.pdf_specific.page_count,
201
+ );
192
202
 
193
203
  if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
194
204
  eprintln!(
@@ -365,6 +375,159 @@ mod tests {
365
375
  assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2)).fallback);
366
376
  }
367
377
 
378
+ #[cfg(feature = "ocr")]
379
+ #[test]
380
+ fn test_per_page_ocr_no_boundaries_falls_back_to_whole_doc() {
381
+ let text = "This document has enough meaningful words for evaluation purposes here.";
382
+ let decision = ocr::evaluate_per_page_ocr(text, None, Some(1));
383
+ assert!(!decision.fallback);
384
+ }
385
+
386
+ #[cfg(feature = "ocr")]
387
+ #[test]
388
+ fn test_per_page_ocr_empty_boundaries_falls_back_to_whole_doc() {
389
+ let text = "This document has enough meaningful words for evaluation purposes here.";
390
+ let decision = ocr::evaluate_per_page_ocr(text, Some(&[]), Some(1));
391
+ assert!(!decision.fallback);
392
+ }
393
+
394
+ #[cfg(feature = "ocr")]
395
+ #[test]
396
+ fn test_per_page_ocr_all_pages_good() {
397
+ use crate::types::PageBoundary;
398
+
399
+ let page1 = "This first page has plenty of meaningful searchable text content here.";
400
+ let page2 = "This second page also has plenty of meaningful searchable text content.";
401
+ let text = format!("{}{}", page1, page2);
402
+ let boundaries = vec![
403
+ PageBoundary {
404
+ byte_start: 0,
405
+ byte_end: page1.len(),
406
+ page_number: 1,
407
+ },
408
+ PageBoundary {
409
+ byte_start: page1.len(),
410
+ byte_end: text.len(),
411
+ page_number: 2,
412
+ },
413
+ ];
414
+
415
+ let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
416
+ assert!(!decision.fallback);
417
+ }
418
+
419
+ #[cfg(feature = "ocr")]
420
+ #[test]
421
+ fn test_per_page_ocr_one_bad_page_triggers_fallback() {
422
+ use crate::types::PageBoundary;
423
+
424
+ let good_page = "This page has plenty of meaningful searchable text content for extraction.";
425
+ let bad_page = " . ; ";
426
+ let text = format!("{}{}", good_page, bad_page);
427
+ let boundaries = vec![
428
+ PageBoundary {
429
+ byte_start: 0,
430
+ byte_end: good_page.len(),
431
+ page_number: 1,
432
+ },
433
+ PageBoundary {
434
+ byte_start: good_page.len(),
435
+ byte_end: text.len(),
436
+ page_number: 2,
437
+ },
438
+ ];
439
+
440
+ let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
441
+ assert!(decision.fallback);
442
+ }
443
+
444
+ #[cfg(feature = "ocr")]
445
+ #[test]
446
+ fn test_per_page_ocr_empty_page_triggers_fallback() {
447
+ use crate::types::PageBoundary;
448
+
449
+ let good_page = "This page has plenty of meaningful searchable text content for extraction.";
450
+ let empty_page = "";
451
+ let text = format!("{}{}", good_page, empty_page);
452
+ let boundaries = vec![
453
+ PageBoundary {
454
+ byte_start: 0,
455
+ byte_end: good_page.len(),
456
+ page_number: 1,
457
+ },
458
+ PageBoundary {
459
+ byte_start: good_page.len(),
460
+ byte_end: text.len(),
461
+ page_number: 2,
462
+ },
463
+ ];
464
+
465
+ let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
466
+ assert!(decision.fallback);
467
+ }
468
+
469
+ #[cfg(feature = "ocr")]
470
+ #[test]
471
+ fn test_per_page_ocr_preserves_document_stats_on_fallback() {
472
+ use crate::types::PageBoundary;
473
+
474
+ let good_page = "This page has plenty of meaningful searchable text content for extraction.";
475
+ let bad_page = " . ; ";
476
+ let text = format!("{}{}", good_page, bad_page);
477
+ let boundaries = vec![
478
+ PageBoundary {
479
+ byte_start: 0,
480
+ byte_end: good_page.len(),
481
+ page_number: 1,
482
+ },
483
+ PageBoundary {
484
+ byte_start: good_page.len(),
485
+ byte_end: text.len(),
486
+ page_number: 2,
487
+ },
488
+ ];
489
+
490
+ let decision = ocr::evaluate_per_page_ocr(&text, Some(&boundaries), Some(2));
491
+ assert!(decision.fallback);
492
+ assert!(decision.stats.non_whitespace > 0);
493
+ assert!(decision.stats.meaningful_words > 0);
494
+ }
495
+
496
+ #[cfg(feature = "ocr")]
497
+ #[test]
498
+ fn test_per_page_ocr_invalid_boundaries_skipped() {
499
+ use crate::types::PageBoundary;
500
+
501
+ let text = "This page has plenty of meaningful searchable text content for extraction.";
502
+ let boundaries = vec![
503
+ PageBoundary {
504
+ byte_start: 0,
505
+ byte_end: text.len(),
506
+ page_number: 1,
507
+ },
508
+ PageBoundary {
509
+ byte_start: 999,
510
+ byte_end: 9999,
511
+ page_number: 2,
512
+ },
513
+ ];
514
+
515
+ let decision = ocr::evaluate_per_page_ocr(text, Some(&boundaries), Some(1));
516
+ assert!(!decision.fallback);
517
+ }
518
+
519
+ #[cfg(feature = "ocr")]
520
+ #[test]
521
+ fn test_per_page_ocr_multi_page_correct_page_count() {
522
+ let text = "ab cd ef";
523
+ let decision_wrong = ocr::evaluate_native_text_for_ocr(text, None);
524
+ let decision_correct = ocr::evaluate_native_text_for_ocr(text, Some(20));
525
+ assert!(
526
+ decision_correct.avg_non_whitespace < decision_wrong.avg_non_whitespace,
527
+ "Correct page count should produce lower per-page averages"
528
+ );
529
+ }
530
+
368
531
  #[tokio::test]
369
532
  #[cfg(feature = "pdf")]
370
533
  async fn test_pdf_batch_mode_validates_page_config_enabled() {
@@ -139,6 +139,33 @@ pub fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>
139
139
  }
140
140
  }
141
141
 
142
+ #[cfg(feature = "ocr")]
143
+ pub fn evaluate_per_page_ocr(
144
+ native_text: &str,
145
+ boundaries: Option<&[crate::types::PageBoundary]>,
146
+ page_count: Option<usize>,
147
+ ) -> OcrFallbackDecision {
148
+ let boundaries = match boundaries {
149
+ Some(b) if !b.is_empty() => b,
150
+ _ => return evaluate_native_text_for_ocr(native_text, page_count),
151
+ };
152
+
153
+ let mut document_decision = evaluate_native_text_for_ocr(native_text, page_count);
154
+
155
+ for boundary in boundaries {
156
+ if boundary.byte_end > native_text.len() || boundary.byte_start > boundary.byte_end {
157
+ continue;
158
+ }
159
+ let page_text = &native_text[boundary.byte_start..boundary.byte_end];
160
+ if evaluate_native_text_for_ocr(page_text, Some(1)).fallback {
161
+ document_decision.fallback = true;
162
+ return document_decision;
163
+ }
164
+ }
165
+
166
+ document_decision
167
+ }
168
+
142
169
  /// Extract text from PDF using OCR.
143
170
  ///
144
171
  /// Renders all pages to images and processes them with OCR backend.
@@ -15,9 +15,6 @@ pub struct ExtractFileParams {
15
15
  /// Extraction configuration (JSON object)
16
16
  #[serde(skip_serializing_if = "Option::is_none")]
17
17
  pub config: Option<serde_json::Value>,
18
- /// Use async extraction (default: false for sync)
19
- #[serde(default)]
20
- pub r#async: bool,
21
18
  }
22
19
 
23
20
  /// Request parameters for bytes extraction.
@@ -31,9 +28,6 @@ pub struct ExtractBytesParams {
31
28
  /// Extraction configuration (JSON object)
32
29
  #[serde(skip_serializing_if = "Option::is_none")]
33
30
  pub config: Option<serde_json::Value>,
34
- /// Use async extraction (default: false for sync)
35
- #[serde(default)]
36
- pub r#async: bool,
37
31
  }
38
32
 
39
33
  /// Request parameters for batch file extraction.
@@ -44,9 +38,6 @@ pub struct BatchExtractFilesParams {
44
38
  /// Extraction configuration (JSON object)
45
39
  #[serde(skip_serializing_if = "Option::is_none")]
46
40
  pub config: Option<serde_json::Value>,
47
- /// Use async extraction (default: false for sync)
48
- #[serde(default)]
49
- pub r#async: bool,
50
41
  }
51
42
 
52
43
  /// Request parameters for MIME type detection.
@@ -75,7 +66,6 @@ mod tests {
75
66
  assert_eq!(params.path, "/test.pdf");
76
67
  assert_eq!(params.mime_type, None);
77
68
  assert_eq!(params.config, None);
78
- assert!(!params.r#async);
79
69
  }
80
70
 
81
71
  #[test]
@@ -86,7 +76,6 @@ mod tests {
86
76
  assert_eq!(params.data, "SGVsbG8=");
87
77
  assert_eq!(params.mime_type, None);
88
78
  assert_eq!(params.config, None);
89
- assert!(!params.r#async);
90
79
  }
91
80
 
92
81
  #[test]
@@ -96,7 +85,6 @@ mod tests {
96
85
 
97
86
  assert_eq!(params.paths.len(), 2);
98
87
  assert_eq!(params.config, None);
99
- assert!(!params.r#async);
100
88
  }
101
89
 
102
90
  #[test]
@@ -131,7 +119,6 @@ mod tests {
131
119
  path: "/test.pdf".to_string(),
132
120
  mime_type: Some("application/pdf".to_string()),
133
121
  config: Some(serde_json::json!({"use_cache": false})),
134
- r#async: true,
135
122
  };
136
123
 
137
124
  let json = serde_json::to_string(&params).unwrap();
@@ -140,7 +127,6 @@ mod tests {
140
127
  assert_eq!(params.path, deserialized.path);
141
128
  assert_eq!(params.mime_type, deserialized.mime_type);
142
129
  assert_eq!(params.config, deserialized.config);
143
- assert_eq!(params.r#async, deserialized.r#async);
144
130
  }
145
131
 
146
132
  #[test]
@@ -149,7 +135,6 @@ mod tests {
149
135
  data: "SGVsbG8=".to_string(),
150
136
  mime_type: None,
151
137
  config: None,
152
- r#async: false,
153
138
  };
154
139
 
155
140
  let json = serde_json::to_string(&params).unwrap();
@@ -163,7 +148,6 @@ mod tests {
163
148
  let params = BatchExtractFilesParams {
164
149
  paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
165
150
  config: Some(serde_json::json!({"use_cache": true})),
166
- r#async: true,
167
151
  };
168
152
 
169
153
  let json = serde_json::to_string(&params).unwrap();
@@ -68,6 +68,10 @@ impl KreuzbergMcp {
68
68
  ///
69
69
  /// This tool extracts text, metadata, and tables from documents in various formats
70
70
  /// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
71
+ ///
72
+ /// Note: The `async` parameter has been removed from the request parameters.
73
+ /// Extraction always runs asynchronously since the MCP server operates within
74
+ /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
71
75
  #[tool(
72
76
  description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
73
77
  annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
@@ -78,18 +82,17 @@ impl KreuzbergMcp {
78
82
  ) -> Result<CallToolResult, rmcp::ErrorData> {
79
83
  use super::errors::map_kreuzberg_error_to_mcp;
80
84
  use super::format::{build_config, format_extraction_result};
81
- use crate::{extract_file, extract_file_sync};
85
+ use crate::extract_file;
82
86
 
83
87
  let config =
84
88
  build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
85
89
 
86
- let result = if params.r#async {
87
- extract_file(&params.path, params.mime_type.as_deref(), &config)
88
- .await
89
- .map_err(map_kreuzberg_error_to_mcp)?
90
- } else {
91
- extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
92
- };
90
+ // Always use async extraction - we're already in a Tokio runtime context.
91
+ // Calling sync wrappers (which use GLOBAL_RUNTIME.block_on()) from within
92
+ // an async context causes "Cannot start a runtime from within a runtime" panic.
93
+ let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
94
+ .await
95
+ .map_err(map_kreuzberg_error_to_mcp)?;
93
96
 
94
97
  let response = format_extraction_result(&result);
95
98
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -98,6 +101,10 @@ impl KreuzbergMcp {
98
101
  /// Extract content from base64-encoded bytes.
99
102
  ///
100
103
  /// This tool extracts text, metadata, and tables from base64-encoded document data.
104
+ ///
105
+ /// Note: The `async` parameter has been removed from the request parameters.
106
+ /// Extraction always runs asynchronously since the MCP server operates within
107
+ /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
101
108
  #[tool(
102
109
  description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
103
110
  annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
@@ -108,7 +115,7 @@ impl KreuzbergMcp {
108
115
  ) -> Result<CallToolResult, rmcp::ErrorData> {
109
116
  use super::errors::map_kreuzberg_error_to_mcp;
110
117
  use super::format::{build_config, format_extraction_result};
111
- use crate::{extract_bytes, extract_bytes_sync};
118
+ use crate::extract_bytes;
112
119
  use base64::prelude::*;
113
120
 
114
121
  let bytes = BASE64_STANDARD
@@ -120,13 +127,10 @@ impl KreuzbergMcp {
120
127
 
121
128
  let mime_type = params.mime_type.as_deref().unwrap_or("");
122
129
 
123
- let result = if params.r#async {
124
- extract_bytes(&bytes, mime_type, &config)
125
- .await
126
- .map_err(map_kreuzberg_error_to_mcp)?
127
- } else {
128
- extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
129
- };
130
+ // Always use async extraction - we're already in a Tokio runtime context.
131
+ let result = extract_bytes(&bytes, mime_type, &config)
132
+ .await
133
+ .map_err(map_kreuzberg_error_to_mcp)?;
130
134
 
131
135
  let response = format_extraction_result(&result);
132
136
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -135,6 +139,10 @@ impl KreuzbergMcp {
135
139
  /// Extract content from multiple files in parallel.
136
140
  ///
137
141
  /// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
142
+ ///
143
+ /// Note: The `async` parameter has been removed from the request parameters.
144
+ /// Extraction always runs asynchronously since the MCP server operates within
145
+ /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
138
146
  #[tool(
139
147
  description = "Extract content from multiple files in parallel. Returns results for all files.",
140
148
  annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
@@ -145,18 +153,15 @@ impl KreuzbergMcp {
145
153
  ) -> Result<CallToolResult, rmcp::ErrorData> {
146
154
  use super::errors::map_kreuzberg_error_to_mcp;
147
155
  use super::format::build_config;
148
- use crate::{batch_extract_file, batch_extract_file_sync};
156
+ use crate::batch_extract_file;
149
157
 
150
158
  let config =
151
159
  build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
152
160
 
153
- let results = if params.r#async {
154
- batch_extract_file(params.paths.clone(), &config)
155
- .await
156
- .map_err(map_kreuzberg_error_to_mcp)?
157
- } else {
158
- batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
159
- };
161
+ // Always use async extraction - we're already in a Tokio runtime context.
162
+ let results = batch_extract_file(params.paths.clone(), &config)
163
+ .await
164
+ .map_err(map_kreuzberg_error_to_mcp)?;
160
165
 
161
166
  let response = serde_json::to_string_pretty(&results).unwrap_or_default();
162
167
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -3,8 +3,8 @@
3
3
  use base64::prelude::*;
4
4
  use std::borrow::Cow;
5
5
  use crate::{
6
- ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
7
- extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
6
+ ExtractionConfig, batch_extract_file, extract_bytes, extract_file,
7
+ mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
8
8
  mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
9
9
  };
10
10
  use rmcp::{
@@ -34,13 +34,9 @@ pub(in crate::mcp) trait ExtractionTool {
34
34
  let config = build_config(self.default_config(), params.config)
35
35
  .map_err(|e| McpError::invalid_params(e, None))?;
36
36
 
37
- let result = if params.r#async {
38
- extract_file(&params.path, params.mime_type.as_deref(), &config)
39
- .await
40
- .map_err(map_kreuzberg_error_to_mcp)?
41
- } else {
42
- extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
43
- };
37
+ let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
38
+ .await
39
+ .map_err(map_kreuzberg_error_to_mcp)?;
44
40
 
45
41
  let response = format_extraction_result(&result);
46
42
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -66,13 +62,9 @@ pub(in crate::mcp) trait ExtractionTool {
66
62
 
67
63
  let mime_type = params.mime_type.as_deref().unwrap_or("");
68
64
 
69
- let result = if params.r#async {
70
- extract_bytes(&bytes, mime_type, &config)
71
- .await
72
- .map_err(map_kreuzberg_error_to_mcp)?
73
- } else {
74
- extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
75
- };
65
+ let result = extract_bytes(&bytes, mime_type, &config)
66
+ .await
67
+ .map_err(map_kreuzberg_error_to_mcp)?;
76
68
 
77
69
  let response = format_extraction_result(&result);
78
70
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -92,13 +84,9 @@ pub(in crate::mcp) trait ExtractionTool {
92
84
  let config = build_config(self.default_config(), params.config)
93
85
  .map_err(|e| McpError::invalid_params(e, None))?;
94
86
 
95
- let results = if params.r#async {
96
- batch_extract_file(params.paths.clone(), &config)
97
- .await
98
- .map_err(map_kreuzberg_error_to_mcp)?
99
- } else {
100
- batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
101
- };
87
+ let results = batch_extract_file(params.paths.clone(), &config)
88
+ .await
89
+ .map_err(map_kreuzberg_error_to_mcp)?;
102
90
 
103
91
  let response = serde_json::to_string_pretty(&results).unwrap_or_default();
104
92
  Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -152,8 +140,7 @@ mod tests {
152
140
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
153
141
  mime_type: None,
154
142
  config: None,
155
- r#async: true,
156
- };
143
+ };
157
144
 
158
145
  let result = server.extract_file(Parameters(params)).await;
159
146
 
@@ -179,8 +166,7 @@ mod tests {
179
166
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
180
167
  mime_type: None,
181
168
  config: None,
182
- r#async: true,
183
- };
169
+ };
184
170
 
185
171
  let result = server.extract_file(Parameters(params)).await;
186
172
 
@@ -205,8 +191,7 @@ mod tests {
205
191
  path: "/nonexistent/file.pdf".to_string(),
206
192
  mime_type: None,
207
193
  config: None,
208
- r#async: true,
209
- };
194
+ };
210
195
 
211
196
  let result = server.extract_file(Parameters(params)).await;
212
197
 
@@ -222,8 +207,7 @@ mod tests {
222
207
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
223
208
  mime_type: Some(Cow::Borrowed("application/pdf")),
224
209
  config: None,
225
- r#async: true,
226
- };
210
+ };
227
211
 
228
212
  let result = server.extract_file(Parameters(params)).await;
229
213
 
@@ -241,8 +225,7 @@ mod tests {
241
225
  data: encoded,
242
226
  mime_type: Some(Cow::Borrowed("text/plain")),
243
227
  config: None,
244
- r#async: true,
245
- };
228
+ };
246
229
 
247
230
  let result = server.extract_bytes(Parameters(params)).await;
248
231
 
@@ -268,8 +251,7 @@ mod tests {
268
251
  data: "not-valid-base64!!!".to_string(),
269
252
  mime_type: None,
270
253
  config: None,
271
- r#async: true,
272
- };
254
+ };
273
255
 
274
256
  let result = server.extract_bytes(Parameters(params)).await;
275
257
 
@@ -285,8 +267,7 @@ mod tests {
285
267
  let params = BatchExtractFilesParams {
286
268
  paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
287
269
  config: None,
288
- r#async: true,
289
- };
270
+ };
290
271
 
291
272
  let result = server.batch_extract_files(Parameters(params)).await;
292
273
 
@@ -311,8 +292,7 @@ mod tests {
311
292
  let params = BatchExtractFilesParams {
312
293
  paths: vec![],
313
294
  config: None,
314
- r#async: true,
315
- };
295
+ };
316
296
 
317
297
  let result = server.batch_extract_files(Parameters(params)).await;
318
298
 
@@ -341,8 +321,7 @@ mod tests {
341
321
  path: test_file.to_string(),
342
322
  mime_type: None,
343
323
  config: None,
344
- r#async: true,
345
- };
324
+ };
346
325
 
347
326
  let result = server.extract_file(Parameters(params)).await;
348
327
 
@@ -368,8 +347,7 @@ mod tests {
368
347
  let params = BatchExtractFilesParams {
369
348
  paths: vec![file1.to_string(), file2.to_string()],
370
349
  config: None,
371
- r#async: true,
372
- };
350
+ };
373
351
 
374
352
  let result = server.batch_extract_files(Parameters(params)).await;
375
353
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.8"
3
+ version = "4.2.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.8
4
+ version: 4.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-02 00:00:00.000000000 Z
11
+ date: 2026-02-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys