kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
@@ -1,5 +1,3 @@
1
- #![cfg(feature = "api")]
2
-
3
1
  //! REST API server for Kreuzberg document extraction.
4
2
  //!
5
3
  //! This module provides an Axum-based HTTP server for document extraction
@@ -653,6 +653,18 @@ mod tests {
653
653
  #[tokio::test]
654
654
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
655
655
  async fn test_pipeline_with_keyword_extraction() {
656
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
657
+ crate::plugins::registry::get_validator_registry()
658
+ .write()
659
+ .unwrap()
660
+ .shutdown_all()
661
+ .unwrap();
662
+ crate::plugins::registry::get_post_processor_registry()
663
+ .write()
664
+ .unwrap()
665
+ .shutdown_all()
666
+ .unwrap();
667
+
656
668
  let _ = crate::keywords::register_keyword_processor();
657
669
 
658
670
  let result = ExtractionResult {
@@ -703,6 +715,7 @@ Natural language processing enables computers to understand human language.
703
715
  #[tokio::test]
704
716
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
705
717
  async fn test_pipeline_without_keyword_config() {
718
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
706
719
  let result = ExtractionResult {
707
720
  content: "Machine learning and artificial intelligence.".to_string(),
708
721
  mime_type: "text/plain".to_string(),
@@ -11,6 +11,30 @@
11
11
  //! - Batch processing for efficient embedding generation
12
12
  //! - Optional GPU acceleration via ONNX Runtime execution providers
13
13
  //!
14
+ //! # ONNX Runtime Requirement
15
+ //!
16
+ //! **CRITICAL**: This module requires ONNX Runtime to be installed on the system.
17
+ //! The `embeddings` feature uses dynamic loading (`ort-load-dynamic`), which detects
18
+ //! the ONNX Runtime library at runtime.
19
+ //!
20
+ //! ## Installation Instructions
21
+ //!
22
+ //! - **macOS**: `brew install onnxruntime`
23
+ //! - **Linux (Ubuntu/Debian)**: `apt install libonnxruntime libonnxruntime-dev`
24
+ //! - **Linux (Fedora)**: `dnf install onnxruntime onnxruntime-devel`
25
+ //! - **Linux (Arch)**: `pacman -S onnxruntime`
26
+ //! - **Windows (MSVC)**: Download from https://github.com/microsoft/onnxruntime/releases and add to PATH
27
+ //!
28
+ //! Alternatively, set the `ORT_DYLIB_PATH` environment variable to the ONNX Runtime library path.
29
+ //!
30
+ //! For Docker/containers, install via package manager in your base image.
31
+ //! Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux.
32
+ //!
33
+ //! ## Platform Limitations
34
+ //!
35
+ //! **Windows MinGW builds are not supported**. ONNX Runtime requires the MSVC toolchain on Windows.
36
+ //! Please use Windows MSVC builds or disable the embeddings feature.
37
+ //!
14
38
  //! # Example
15
39
  //!
16
40
  //! ```rust,ignore
@@ -145,6 +169,35 @@ lazy_static! {
145
169
  static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
146
170
  }
147
171
 
172
+ /// Returns installation instructions for ONNX Runtime.
173
+ #[cfg(feature = "embeddings")]
174
+ fn onnx_runtime_install_message() -> String {
175
+ #[cfg(all(windows, target_env = "gnu"))]
176
+ {
177
+ return "ONNX Runtime embeddings are not supported on Windows MinGW builds. \
178
+ ONNX Runtime requires MSVC toolchain. \
179
+ Please use Windows MSVC builds or disable embeddings feature."
180
+ .to_string();
181
+ }
182
+
183
+ #[cfg(not(all(windows, target_env = "gnu")))]
184
+ {
185
+ "ONNX Runtime is required for embeddings functionality. \
186
+ Install: \
187
+ macOS: 'brew install onnxruntime', \
188
+ Linux (Ubuntu/Debian): 'apt install libonnxruntime libonnxruntime-dev', \
189
+ Linux (Fedora): 'dnf install onnxruntime onnxruntime-devel', \
190
+ Linux (Arch): 'pacman -S onnxruntime', \
191
+ Windows (MSVC): Download from https://github.com/microsoft/onnxruntime/releases and add to PATH. \
192
+ \
193
+ Alternatively, set ORT_DYLIB_PATH environment variable to the ONNX Runtime library path. \
194
+ \
195
+ For Docker/containers: Install via package manager in your base image. \
196
+ Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux."
197
+ .to_string()
198
+ }
199
+ }
200
+
148
201
  /// Get or initialize a text embedding model from cache.
149
202
  ///
150
203
  /// This function ensures models are initialized only once and reused across
@@ -193,9 +246,24 @@ pub fn get_or_init_model(
193
246
  let mut init_options = InitOptions::new(model);
194
247
  init_options = init_options.with_cache_dir(cache_directory);
195
248
 
196
- let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| crate::KreuzbergError::Plugin {
197
- message: format!("Failed to initialize embedding model: {}", e),
198
- plugin_name: "embeddings".to_string(),
249
+ let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
250
+ let error_msg = e.to_string();
251
+
252
+ // Detect ONNX Runtime loading errors by checking for common patterns
253
+ if error_msg.contains("onnxruntime")
254
+ || error_msg.contains("ORT")
255
+ || error_msg.contains("libonnxruntime")
256
+ || error_msg.contains("onnxruntime.dll")
257
+ || error_msg.contains("Unable to load")
258
+ || error_msg.contains("library load failed")
259
+ {
260
+ crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
261
+ } else {
262
+ crate::KreuzbergError::Plugin {
263
+ message: format!("Failed to initialize embedding model: {}", e),
264
+ plugin_name: "embeddings".to_string(),
265
+ }
266
+ }
199
267
  })?;
200
268
 
201
269
  let leaked_model = LeakedModel::new(embedding_model);
@@ -177,7 +177,7 @@ impl From<crate::pdf::error::PdfError> for KreuzbergError {
177
177
 
178
178
  macro_rules! error_constructor {
179
179
  ($name:ident, $variant:ident) => {
180
- paste::paste! {
180
+ pastey::paste! {
181
181
  #[doc = "Create a " $variant " error"]
182
182
  pub fn $name<S: Into<String>>(message: S) -> Self {
183
183
  Self::$variant {
@@ -384,7 +384,7 @@ mod tests {
384
384
  #[test]
385
385
  fn test_extract_text_with_page_breaks_no_breaks() {
386
386
  let docx_path =
387
- "/Users/naamanhirschfeld/workspace/kreuzberg-dev/kreuzberg/test_documents/documents/lorem_ipsum.docx";
387
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/documents/lorem_ipsum.docx");
388
388
  if let Ok(bytes) = std::fs::read(docx_path) {
389
389
  let result = extract_text_with_page_breaks(&bytes);
390
390
  if let Ok((text, boundaries)) = result {
@@ -31,14 +31,26 @@ use html_to_markdown_rs::{
31
31
  convert as convert_html, convert_with_inline_images,
32
32
  };
33
33
  use serde::{Deserialize, Serialize};
34
- use std::{any::Any, collections::HashMap, thread};
34
+ use std::collections::HashMap;
35
+
36
+ #[cfg(not(target_arch = "wasm32"))]
37
+ use std::{any::Any, thread};
35
38
 
36
39
  pub use html_to_markdown_rs::{
37
40
  CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
38
41
  PreprocessingPreset, WhitespaceMode,
39
42
  };
40
43
 
44
+ // WASM has a much smaller stack than native targets, but it can't spawn threads,
45
+ // so the dedicated large-stack path is unavailable there and a size threshold doesn't help
46
+ // We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
47
+ #[cfg(target_arch = "wasm32")]
48
+ const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = usize::MAX;
49
+
50
+ #[cfg(not(target_arch = "wasm32"))]
41
51
  const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
52
+
53
+ #[cfg(not(target_arch = "wasm32"))]
42
54
  const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
43
55
 
44
56
  /// Result of HTML extraction with optional images and warnings.
@@ -132,10 +144,6 @@ fn convert_html_with_options(html: &str, options: ConversionOptions) -> Result<S
132
144
  .map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown: {}", e)))
133
145
  }
134
146
 
135
- fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
136
- run_on_dedicated_stack(move || convert_html_with_options(&html, options))
137
- }
138
-
139
147
  fn convert_inline_images_with_options(
140
148
  html: &str,
141
149
  options: ConversionOptions,
@@ -145,6 +153,13 @@ fn convert_inline_images_with_options(
145
153
  .map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown with images: {}", e)))
146
154
  }
147
155
 
156
+ // Native (non-WASM) implementations use a dedicated thread stack for large HTML documents
157
+ #[cfg(not(target_arch = "wasm32"))]
158
+ fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
159
+ run_on_dedicated_stack(move || convert_html_with_options(&html, options))
160
+ }
161
+
162
+ #[cfg(not(target_arch = "wasm32"))]
148
163
  fn convert_inline_images_with_large_stack(
149
164
  html: String,
150
165
  options: ConversionOptions,
@@ -153,6 +168,7 @@ fn convert_inline_images_with_large_stack(
153
168
  run_on_dedicated_stack(move || convert_inline_images_with_options(&html, options, image_config))
154
169
  }
155
170
 
171
+ #[cfg(not(target_arch = "wasm32"))]
156
172
  fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
157
173
  where
158
174
  T: Send + 'static,
@@ -173,6 +189,7 @@ where
173
189
  }
174
190
  }
175
191
 
192
+ #[cfg(not(target_arch = "wasm32"))]
176
193
  fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
177
194
  if let Some(msg) = panic.downcast_ref::<&str>() {
178
195
  (*msg).to_string()
@@ -183,6 +200,21 @@ fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
183
200
  }
184
201
  }
185
202
 
203
+ // WASM implementations skip dedicated stack (not supported) and process inline
204
+ #[cfg(target_arch = "wasm32")]
205
+ fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
206
+ convert_html_with_options(&html, options)
207
+ }
208
+
209
+ #[cfg(target_arch = "wasm32")]
210
+ fn convert_inline_images_with_large_stack(
211
+ html: String,
212
+ options: ConversionOptions,
213
+ image_config: LibInlineImageConfig,
214
+ ) -> Result<HtmlExtraction> {
215
+ convert_inline_images_with_options(&html, options, image_config)
216
+ }
217
+
186
218
  /// Convert HTML to markdown with optional configuration.
187
219
  ///
188
220
  /// Uses sensible defaults if no configuration is provided:
@@ -325,19 +325,33 @@ impl DocumentExtractor for PdfExtractor {
325
325
  config: &ExtractionConfig,
326
326
  ) -> Result<ExtractionResult> {
327
327
  #[cfg(feature = "pdf")]
328
- let (pdf_metadata, native_text, tables, page_contents) = if crate::core::batch_mode::is_batch_mode() {
329
- let content_owned = content.to_vec();
330
- let span = tracing::Span::current();
331
- let pages_config = config.pages.clone();
332
- tokio::task::spawn_blocking(move || {
333
- let _guard = span.entered();
334
- let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
335
- .or_else(|_| Pdfium::bind_to_system_library())
336
- .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
337
-
328
+ let (pdf_metadata, native_text, tables, page_contents) = {
329
+ // WASM target: always synchronous (no tokio::task::spawn_blocking)
330
+ // Other targets: use spawn_blocking in batch mode for better parallelism
331
+ #[cfg(target_arch = "wasm32")]
332
+ {
333
+ // NOTE: For WASM targets, this code path should only be reached if the
334
+ // WASM environment has properly initialized PDFium. The error message
335
+ // will direct users to the documentation for setup requirements.
336
+ let bindings =
337
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
338
+ .map_err(|pdf_err| {
339
+ // Provide context-specific error for WASM PDF failures
340
+ if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
341
+ crate::error::KreuzbergError::Parsing {
342
+ message: "PDF extraction requires proper WASM module initialization. \
343
+ Ensure your WASM environment is set up with PDFium support. \
344
+ See: https://docs.kreuzberg.dev/wasm/pdf"
345
+ .to_string(),
346
+ source: None,
347
+ }
348
+ } else {
349
+ pdf_err.into()
350
+ }
351
+ })?;
338
352
  let pdfium = Pdfium::new(bindings);
339
353
 
340
- let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
354
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
341
355
  let err_msg = e.to_string();
342
356
  if err_msg.contains("password") || err_msg.contains("Password") {
343
357
  PdfError::PasswordRequired
@@ -347,51 +361,86 @@ impl DocumentExtractor for PdfExtractor {
347
361
  })?;
348
362
 
349
363
  let (native_text, boundaries, page_contents) =
350
- crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
364
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
351
365
 
352
366
  let pdf_metadata =
353
367
  crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
354
368
 
355
369
  let tables = extract_tables_from_document(&document, &pdf_metadata)?;
356
370
 
357
- if let Some(ref page_cfg) = pages_config
358
- && page_cfg.extract_pages
359
- && page_contents.is_none()
360
- {
361
- return Err(PdfError::ExtractionFailed(
362
- "Page extraction was configured but no page data was extracted in batch mode".to_string(),
363
- )
364
- .into());
365
- }
366
-
367
- Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
368
- })
369
- .await
370
- .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
371
- } else {
372
- let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
373
- .or_else(|_| Pdfium::bind_to_system_library())
374
- .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
371
+ (pdf_metadata, native_text, tables, page_contents)
372
+ }
373
+ #[cfg(not(target_arch = "wasm32"))]
374
+ {
375
+ if crate::core::batch_mode::is_batch_mode() {
376
+ let content_owned = content.to_vec();
377
+ let span = tracing::Span::current();
378
+ let pages_config = config.pages.clone();
379
+ tokio::task::spawn_blocking(move || {
380
+ let _guard = span.entered();
381
+ let bindings =
382
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
383
+
384
+ let pdfium = Pdfium::new(bindings);
385
+
386
+ let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
387
+ let err_msg = e.to_string();
388
+ if err_msg.contains("password") || err_msg.contains("Password") {
389
+ PdfError::PasswordRequired
390
+ } else {
391
+ PdfError::InvalidPdf(err_msg)
392
+ }
393
+ })?;
394
+
395
+ let (native_text, boundaries, page_contents) =
396
+ crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
397
+
398
+ let pdf_metadata =
399
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
400
+
401
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
402
+
403
+ if let Some(ref page_cfg) = pages_config
404
+ && page_cfg.extract_pages
405
+ && page_contents.is_none()
406
+ {
407
+ return Err(PdfError::ExtractionFailed(
408
+ "Page extraction was configured but no page data was extracted in batch mode"
409
+ .to_string(),
410
+ )
411
+ .into());
412
+ }
413
+
414
+ Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
415
+ })
416
+ .await
417
+ .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
418
+ } else {
419
+ let bindings =
420
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
375
421
 
376
- let pdfium = Pdfium::new(bindings);
422
+ let pdfium = Pdfium::new(bindings);
377
423
 
378
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
379
- let err_msg = e.to_string();
380
- if err_msg.contains("password") || err_msg.contains("Password") {
381
- PdfError::PasswordRequired
382
- } else {
383
- PdfError::InvalidPdf(err_msg)
384
- }
385
- })?;
424
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
425
+ let err_msg = e.to_string();
426
+ if err_msg.contains("password") || err_msg.contains("Password") {
427
+ PdfError::PasswordRequired
428
+ } else {
429
+ PdfError::InvalidPdf(err_msg)
430
+ }
431
+ })?;
386
432
 
387
- let (native_text, boundaries, page_contents) =
388
- crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
433
+ let (native_text, boundaries, page_contents) =
434
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
389
435
 
390
- let pdf_metadata = crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
436
+ let pdf_metadata =
437
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
391
438
 
392
- let tables = extract_tables_from_document(&document, &pdf_metadata)?;
439
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
393
440
 
394
- (pdf_metadata, native_text, tables, page_contents)
441
+ (pdf_metadata, native_text, tables, page_contents)
442
+ }
443
+ }
395
444
  };
396
445
 
397
446
  #[cfg(feature = "ocr")]
@@ -585,7 +634,8 @@ mod tests {
585
634
  ..Default::default()
586
635
  };
587
636
 
588
- let pdf_path = "/Users/naamanhirschfeld/workspace/kreuzberg-dev/kreuzberg/fixtures/pdf/simple_text.pdf";
637
+ let pdf_path =
638
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
589
639
  if let Ok(content) = std::fs::read(pdf_path) {
590
640
  let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
591
641
  assert!(
@@ -608,7 +658,8 @@ mod tests {
608
658
  let extractor = PdfExtractor::new();
609
659
  let config = ExtractionConfig::default();
610
660
 
611
- let pdf_path = "/Users/naamanhirschfeld/workspace/kreuzberg-dev/kreuzberg/fixtures/pdf/simple_text.pdf";
661
+ let pdf_path =
662
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
612
663
  if let Ok(content) = std::fs::read(pdf_path) {
613
664
  let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
614
665
  assert!(
@@ -641,7 +692,8 @@ mod tests {
641
692
  ..Default::default()
642
693
  };
643
694
 
644
- let pdf_path = "/Users/naamanhirschfeld/workspace/kreuzberg-dev/kreuzberg/fixtures/pdf/simple_text.pdf";
695
+ let pdf_path =
696
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/multi_page.pdf");
645
697
  if let Ok(content) = std::fs::read(pdf_path) {
646
698
  let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
647
699
  assert!(
@@ -1,5 +1,3 @@
1
- #![cfg(feature = "mcp")]
2
-
3
1
  //! Model Context Protocol (MCP) server implementation.
4
2
  //!
5
3
  //! Provides an MCP server that exposes Kreuzberg's document extraction
@@ -28,6 +26,9 @@ mod server;
28
26
 
29
27
  pub use server::{start_mcp_server, start_mcp_server_with_config};
30
28
 
29
+ #[cfg(feature = "mcp-http")]
30
+ pub use server::{start_mcp_server_http, start_mcp_server_http_with_config};
31
+
31
32
  pub use server::{BatchExtractFilesParams, DetectMimeTypeParams, ExtractBytesParams, ExtractFileParams, KreuzbergMcp};
32
33
 
33
34
  #[doc(hidden)]
@@ -12,6 +12,9 @@ use rmcp::{
12
12
  transport::stdio,
13
13
  };
14
14
 
15
+ #[cfg(feature = "mcp-http")]
16
+ use rmcp::transport::streamable_http_server::{StreamableHttpService, session::local::LocalSessionManager};
17
+
15
18
  use crate::{
16
19
  ExtractionConfig, ExtractionResult as KreuzbergResult, KreuzbergError, batch_extract_file, batch_extract_file_sync,
17
20
  cache, detect_mime_type, extract_bytes, extract_bytes_sync, extract_file, extract_file_sync,
@@ -453,6 +456,109 @@ pub async fn start_mcp_server_with_config(
453
456
  Ok(())
454
457
  }
455
458
 
459
+ /// Start MCP server with HTTP Stream transport.
460
+ ///
461
+ /// Uses rmcp's built-in StreamableHttpService for HTTP/SSE support per MCP spec.
462
+ ///
463
+ /// # Arguments
464
+ ///
465
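+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0"); it is joined with `port` and parsed as a `SocketAddr`, so hostnames such as "localhost" are rejected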
466
+ /// * `port` - Port number (e.g., 8001)
467
+ ///
468
+ /// # Example
469
+ ///
470
+ /// ```no_run
471
+ /// use kreuzberg::mcp::start_mcp_server_http;
472
+ ///
473
+ /// #[tokio::main]
474
+ /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
475
+ /// start_mcp_server_http("127.0.0.1", 8001).await?;
476
+ /// Ok(())
477
+ /// }
478
+ /// ```
479
+ #[cfg(feature = "mcp-http")]
480
+ pub async fn start_mcp_server_http(
481
+ host: impl AsRef<str>,
482
+ port: u16,
483
+ ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
484
+ use axum::Router;
485
+ use std::net::SocketAddr;
486
+
487
+ let http_service = StreamableHttpService::new(
488
+ || KreuzbergMcp::new().map_err(|e| std::io::Error::other(e.to_string())),
489
+ LocalSessionManager::default().into(),
490
+ Default::default(),
491
+ );
492
+
493
+ let router = Router::new().nest_service("/mcp", http_service);
494
+
495
+ let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
496
+ .parse()
497
+ .map_err(|e| format!("Invalid address: {}", e))?;
498
+
499
+ #[cfg(feature = "api")]
500
+ tracing::info!("Starting MCP HTTP server on http://{}", addr);
501
+
502
+ let listener = tokio::net::TcpListener::bind(addr).await?;
503
+ axum::serve(listener, router).await?;
504
+
505
+ Ok(())
506
+ }
507
+
508
+ /// Start MCP HTTP server with custom extraction config.
509
+ ///
510
+ /// This variant allows specifying a custom extraction configuration
511
+ /// while using HTTP Stream transport.
512
+ ///
513
+ /// # Arguments
514
+ ///
515
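+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0"); it is joined with `port` and parsed as a `SocketAddr`, so hostnames such as "localhost" are rejected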
516
+ /// * `port` - Port number (e.g., 8001)
517
+ /// * `config` - Custom extraction configuration
518
+ ///
519
+ /// # Example
520
+ ///
521
+ /// ```no_run
522
+ /// use kreuzberg::mcp::start_mcp_server_http_with_config;
523
+ /// use kreuzberg::ExtractionConfig;
524
+ ///
525
+ /// #[tokio::main]
526
+ /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
527
+ /// let config = ExtractionConfig::default();
528
+ /// start_mcp_server_http_with_config("127.0.0.1", 8001, config).await?;
529
+ /// Ok(())
530
+ /// }
531
+ /// ```
532
+ #[cfg(feature = "mcp-http")]
533
+ pub async fn start_mcp_server_http_with_config(
534
+ host: impl AsRef<str>,
535
+ port: u16,
536
+ config: ExtractionConfig,
537
+ ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
538
+ use axum::Router;
539
+ use std::net::SocketAddr;
540
+
541
+ let http_service = StreamableHttpService::new(
542
+ move || Ok(KreuzbergMcp::with_config(config.clone())),
543
+ LocalSessionManager::default().into(),
544
+ Default::default(),
545
+ );
546
+
547
+ let router = Router::new().nest_service("/mcp", http_service);
548
+
549
+ let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
550
+ .parse()
551
+ .map_err(|e| format!("Invalid address: {}", e))?;
552
+
553
+ #[cfg(feature = "api")]
554
+ tracing::info!("Starting MCP HTTP server on http://{}", addr);
555
+
556
+ let listener = tokio::net::TcpListener::bind(addr).await?;
557
+ axum::serve(listener, router).await?;
558
+
559
+ Ok(())
560
+ }
561
+
456
562
  /// Build extraction config from MCP parameters.
457
563
  ///
458
564
  /// Starts with the default config and overlays OCR settings from request parameters.
@@ -0,0 +1,44 @@
1
+ use super::error::PdfError;
2
+ use pdfium_render::prelude::*;
3
+
4
+ pub(crate) fn bind_pdfium(
5
+ map_err: fn(String) -> PdfError,
6
+ context: &'static str,
7
+ ) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
8
+ #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
9
+ {
10
+ // WASM target: use dynamic binding to WASM module
11
+ // NOTE: pdfium-render handles WASM module lifecycle internally.
12
+ // For WASM builds, the PDFium library is linked at compile time
13
+ // and the WASM runtime manages initialization.
14
+ #[cfg(target_arch = "wasm32")]
15
+ {
16
+ Pdfium::bind_to_system_library()
17
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium for WASM ({}): {}", context, e)))
18
+ }
19
+
20
+ // Non-WASM targets: extract and link dynamically
21
+ #[cfg(not(target_arch = "wasm32"))]
22
+ {
23
+ let lib_path = crate::pdf::extract_bundled_pdfium()
24
+ .map_err(|e| map_err(format!("Failed to extract bundled Pdfium ({}): {}", context, e)))?;
25
+
26
+ let lib_dir = lib_path.parent().ok_or_else(|| {
27
+ map_err(format!(
28
+ "Failed to determine Pdfium extraction directory for '{}' ({})",
29
+ lib_path.display(),
30
+ context
31
+ ))
32
+ })?;
33
+
34
+ Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
35
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
36
+ }
37
+ }
38
+
39
+ #[cfg(all(feature = "pdf", not(feature = "pdf-bundled")))]
40
+ {
41
+ Pdfium::bind_to_system_library()
42
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
43
+ }
44
+ }