kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -4,8 +4,13 @@
4
4
 
5
5
  use crate::Result;
6
6
  use crate::core::config::LanguageDetectionConfig;
7
+ use once_cell::sync::Lazy;
8
+ use std::sync::Arc;
7
9
  use whatlang::{Lang, detect};
8
10
 
11
+ pub mod processor;
12
+ pub use processor::LanguageDetector;
13
+
9
14
  /// Detect languages in text using whatlang.
10
15
  ///
11
16
  /// Returns a list of detected language codes (ISO 639-3 format).
@@ -940,3 +945,41 @@ mod tests {
940
945
  assert_eq!(langs[0], "eng");
941
946
  }
942
947
  }
948
+
949
+ /// Lazily initialized registration result that ensures the language detection processor is registered only once via this static.
950
+ ///
951
+ /// This static is accessed on first use to automatically register the
952
+ /// language detection processor with the plugin registry.
953
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
954
+
955
+ /// Ensure the language detection processor is registered.
956
+ ///
957
+ /// This function is called automatically when needed.
958
+ /// It's safe to call multiple times - registration only happens once.
959
+ pub fn ensure_initialized() -> Result<()> {
960
+ PROCESSOR_INITIALIZED
961
+ .as_ref()
962
+ .map(|_| ())
963
+ .map_err(|e| crate::KreuzbergError::Plugin {
964
+ message: format!("Failed to register language detection processor: {}", e),
965
+ plugin_name: "language-detection".to_string(),
966
+ })
967
+ }
968
+
969
+ /// Register the language detection processor with the global registry.
970
+ ///
971
+ /// Registration normally happens automatically on first use (through
972
+ /// `ensure_initialized`), so calling this function explicitly is optional.
973
+ ///
974
+ /// **Note:** If you do call it explicitly, call it only once (e.g. at application
975
+ /// startup); every call issues a new `register` request to the global registry.
976
+ pub fn register_language_detection_processor() -> Result<()> {
977
+ let registry = crate::plugins::registry::get_post_processor_registry();
978
+ let mut registry = registry
979
+ .write()
980
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
981
+
982
+ registry.register(Arc::new(LanguageDetector), 40)?;
983
+
984
+ Ok(())
985
+ }
@@ -0,0 +1,219 @@
1
+ //! Language detection post-processor.
2
+ //!
3
+ //! This module provides a PostProcessor plugin that detects languages in
4
+ //! extraction results and stores them in the result.
5
+
6
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
7
+ use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
8
+ use async_trait::async_trait;
9
+
10
+ /// Post-processor that detects languages in document content.
11
+ ///
12
+ /// This processor:
13
+ /// - Runs in the Early processing stage
14
+ /// - Only processes when `config.language_detection` is configured
15
+ /// - Stores detected languages in `result.detected_languages`
16
+ /// - Uses the whatlang library for detection
17
+ ///
18
+ /// # Example
19
+ ///
20
+ /// ```rust,no_run
21
+ /// use kreuzberg::plugins::{Plugin, PostProcessor};
22
+ /// use kreuzberg::language_detection::processor::LanguageDetector;
23
+ ///
24
+ /// let processor = LanguageDetector;
25
+ /// assert_eq!(processor.name(), "language-detection");
26
+ /// ```
27
+ #[derive(Debug, Clone, Copy)]
28
+ pub struct LanguageDetector;
29
+
30
+ impl Plugin for LanguageDetector {
31
+ fn name(&self) -> &str {
32
+ "language-detection"
33
+ }
34
+
35
+ fn version(&self) -> String {
36
+ env!("CARGO_PKG_VERSION").to_string()
37
+ }
38
+
39
+ fn initialize(&self) -> Result<()> {
40
+ Ok(())
41
+ }
42
+
43
+ fn shutdown(&self) -> Result<()> {
44
+ Ok(())
45
+ }
46
+ }
47
+
48
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
49
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
50
+ impl PostProcessor for LanguageDetector {
51
+ async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
52
+ let lang_config = match &config.language_detection {
53
+ Some(cfg) => cfg,
54
+ None => return Ok(()),
55
+ };
56
+
57
+ match super::detect_languages(&result.content, lang_config)
58
+ .map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
59
+ {
60
+ Some(languages) => {
61
+ result.detected_languages = Some(languages);
62
+ }
63
+ None => {
64
+ result.detected_languages = None;
65
+ }
66
+ }
67
+
68
+ Ok(())
69
+ }
70
+
71
+ fn processing_stage(&self) -> ProcessingStage {
72
+ ProcessingStage::Early
73
+ }
74
+
75
+ fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
76
+ config.language_detection.is_some()
77
+ }
78
+
79
+ fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
80
+ let text_length = result.content.len();
81
+ // Language detection is relatively fast: ~1ms per 1KB
82
+ (text_length / 1024).max(1) as u64
83
+ }
84
+ }
85
+
86
+ #[cfg(test)]
87
+ mod tests {
88
+ use super::*;
89
+ use crate::core::config::LanguageDetectionConfig;
90
+ use crate::types::Metadata;
91
+
92
+ #[tokio::test]
93
+ async fn test_language_detector_processor() {
94
+ let processor = LanguageDetector;
95
+ let config = ExtractionConfig {
96
+ language_detection: Some(LanguageDetectionConfig {
97
+ enabled: true,
98
+ min_confidence: 0.8,
99
+ detect_multiple: false,
100
+ }),
101
+ ..Default::default()
102
+ };
103
+
104
+ let mut result = ExtractionResult {
105
+ content: "Hello world! This is a test of the language detection system.".to_string(),
106
+ mime_type: "text/plain".to_string(),
107
+ metadata: Metadata::default(),
108
+ tables: vec![],
109
+ detected_languages: None,
110
+ chunks: None,
111
+ images: None,
112
+ pages: None,
113
+ };
114
+
115
+ processor.process(&mut result, &config).await.unwrap();
116
+
117
+ assert!(result.detected_languages.is_some());
118
+ let langs = result.detected_languages.unwrap();
119
+ assert!(!langs.is_empty());
120
+ assert_eq!(langs[0], "eng");
121
+ }
122
+
123
+ #[tokio::test]
124
+ async fn test_language_detector_no_config() {
125
+ let processor = LanguageDetector;
126
+ let config = ExtractionConfig::default();
127
+
128
+ let mut result = ExtractionResult {
129
+ content: "Hello world!".to_string(),
130
+ mime_type: "text/plain".to_string(),
131
+ metadata: Metadata::default(),
132
+ tables: vec![],
133
+ detected_languages: None,
134
+ chunks: None,
135
+ images: None,
136
+ pages: None,
137
+ };
138
+
139
+ processor.process(&mut result, &config).await.unwrap();
140
+
141
+ assert!(result.detected_languages.is_none());
142
+ }
143
+
144
+ #[test]
145
+ fn test_language_detector_plugin_interface() {
146
+ let processor = LanguageDetector;
147
+ assert_eq!(processor.name(), "language-detection");
148
+ assert!(!processor.version().is_empty());
149
+ assert!(processor.initialize().is_ok());
150
+ assert!(processor.shutdown().is_ok());
151
+ }
152
+
153
+ #[test]
154
+ fn test_language_detector_stage() {
155
+ let processor = LanguageDetector;
156
+ assert_eq!(processor.processing_stage(), ProcessingStage::Early);
157
+ }
158
+
159
+ #[test]
160
+ fn test_language_detector_should_process() {
161
+ let processor = LanguageDetector;
162
+
163
+ let result = ExtractionResult {
164
+ content: "Sample text".to_string(),
165
+ mime_type: "text/plain".to_string(),
166
+ metadata: Metadata::default(),
167
+ tables: vec![],
168
+ detected_languages: None,
169
+ chunks: None,
170
+ images: None,
171
+ pages: None,
172
+ };
173
+
174
+ let config_with_lang = ExtractionConfig {
175
+ language_detection: Some(LanguageDetectionConfig {
176
+ enabled: true,
177
+ min_confidence: 0.8,
178
+ detect_multiple: false,
179
+ }),
180
+ ..Default::default()
181
+ };
182
+ assert!(processor.should_process(&result, &config_with_lang));
183
+
184
+ let config_without_lang = ExtractionConfig::default();
185
+ assert!(!processor.should_process(&result, &config_without_lang));
186
+ }
187
+
188
+ #[test]
189
+ fn test_language_detector_estimated_duration() {
190
+ let processor = LanguageDetector;
191
+
192
+ let short_result = ExtractionResult {
193
+ content: "Short".to_string(),
194
+ mime_type: "text/plain".to_string(),
195
+ metadata: Metadata::default(),
196
+ tables: vec![],
197
+ detected_languages: None,
198
+ chunks: None,
199
+ images: None,
200
+ pages: None,
201
+ };
202
+
203
+ let long_result = ExtractionResult {
204
+ content: "a".repeat(10000),
205
+ mime_type: "text/plain".to_string(),
206
+ metadata: Metadata::default(),
207
+ tables: vec![],
208
+ detected_languages: None,
209
+ chunks: None,
210
+ images: None,
211
+ pages: None,
212
+ };
213
+
214
+ let short_duration = processor.estimated_duration_ms(&short_result);
215
+ let long_duration = processor.estimated_duration_ms(&long_result);
216
+
217
+ assert!(long_duration > short_duration);
218
+ }
219
+ }
@@ -84,13 +84,21 @@ pub use types::*;
84
84
  pub use core::extractor::{batch_extract_bytes, batch_extract_file};
85
85
  pub use core::extractor::{extract_bytes, extract_file};
86
86
 
87
- pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
87
+ // Available in WASM (bytes-based)
88
+ pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
89
+
90
+ // File-based variants need filesystem access; gated behind the `tokio-runtime` feature
91
+ #[cfg(feature = "tokio-runtime")]
92
+ pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
88
93
 
89
94
  pub use core::config::{
90
95
  ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
91
- LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig, TokenReductionConfig,
96
+ LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
92
97
  };
93
98
 
99
+ #[cfg(feature = "pdf")]
100
+ pub use core::config::PdfConfig;
101
+
94
102
  pub use core::mime::{
95
103
  DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
96
104
  PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
@@ -26,6 +26,9 @@ mod server;
26
26
 
27
27
  pub use server::{start_mcp_server, start_mcp_server_with_config};
28
28
 
29
+ #[cfg(feature = "mcp-http")]
30
+ pub use server::{start_mcp_server_http, start_mcp_server_http_with_config};
31
+
29
32
  pub use server::{BatchExtractFilesParams, DetectMimeTypeParams, ExtractBytesParams, ExtractFileParams, KreuzbergMcp};
30
33
 
31
34
  #[doc(hidden)]
@@ -12,6 +12,9 @@ use rmcp::{
12
12
  transport::stdio,
13
13
  };
14
14
 
15
+ #[cfg(feature = "mcp-http")]
16
+ use rmcp::transport::streamable_http_server::{StreamableHttpService, session::local::LocalSessionManager};
17
+
15
18
  use crate::{
16
19
  ExtractionConfig, ExtractionResult as KreuzbergResult, KreuzbergError, batch_extract_file, batch_extract_file_sync,
17
20
  cache, detect_mime_type, extract_bytes, extract_bytes_sync, extract_file, extract_file_sync,
@@ -453,6 +456,109 @@ pub async fn start_mcp_server_with_config(
453
456
  Ok(())
454
457
  }
455
458
 
459
+ /// Start MCP server with HTTP Stream transport.
460
+ ///
461
+ /// Serves rmcp's built-in `StreamableHttpService` (HTTP/SSE per the MCP spec) under the `/mcp` path.
462
+ ///
463
+ /// # Arguments
464
+ ///
465
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0"); it is parsed as part of a `SocketAddr`, so hostnames such as "localhost" are rejected
466
+ /// * `port` - Port number (e.g., 8001)
467
+ ///
468
+ /// # Example
469
+ ///
470
+ /// ```no_run
471
+ /// use kreuzberg::mcp::start_mcp_server_http;
472
+ ///
473
+ /// #[tokio::main]
474
+ /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
475
+ /// start_mcp_server_http("127.0.0.1", 8001).await?;
476
+ /// Ok(())
477
+ /// }
478
+ /// ```
479
+ #[cfg(feature = "mcp-http")]
480
+ pub async fn start_mcp_server_http(
481
+ host: impl AsRef<str>,
482
+ port: u16,
483
+ ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
484
+ use axum::Router;
485
+ use std::net::SocketAddr;
486
+
487
+ let http_service = StreamableHttpService::new(
488
+ || KreuzbergMcp::new().map_err(|e| std::io::Error::other(e.to_string())),
489
+ LocalSessionManager::default().into(),
490
+ Default::default(),
491
+ );
492
+
493
+ let router = Router::new().nest_service("/mcp", http_service);
494
+
495
+ let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
496
+ .parse()
497
+ .map_err(|e| format!("Invalid address: {}", e))?;
498
+
499
+ #[cfg(feature = "api")]
500
+ tracing::info!("Starting MCP HTTP server on http://{}", addr);
501
+
502
+ let listener = tokio::net::TcpListener::bind(addr).await?;
503
+ axum::serve(listener, router).await?;
504
+
505
+ Ok(())
506
+ }
507
+
508
+ /// Start MCP HTTP server with custom extraction config.
509
+ ///
510
+ /// This variant allows specifying a custom extraction configuration
511
+ /// while using HTTP Stream transport.
512
+ ///
513
+ /// # Arguments
514
+ ///
515
+ /// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0"); it is parsed as part of a `SocketAddr`, so hostnames such as "localhost" are rejected
516
+ /// * `port` - Port number (e.g., 8001)
517
+ /// * `config` - Custom extraction configuration
518
+ ///
519
+ /// # Example
520
+ ///
521
+ /// ```no_run
522
+ /// use kreuzberg::mcp::start_mcp_server_http_with_config;
523
+ /// use kreuzberg::ExtractionConfig;
524
+ ///
525
+ /// #[tokio::main]
526
+ /// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
527
+ /// let config = ExtractionConfig::default();
528
+ /// start_mcp_server_http_with_config("127.0.0.1", 8001, config).await?;
529
+ /// Ok(())
530
+ /// }
531
+ /// ```
532
+ #[cfg(feature = "mcp-http")]
533
+ pub async fn start_mcp_server_http_with_config(
534
+ host: impl AsRef<str>,
535
+ port: u16,
536
+ config: ExtractionConfig,
537
+ ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
538
+ use axum::Router;
539
+ use std::net::SocketAddr;
540
+
541
+ let http_service = StreamableHttpService::new(
542
+ move || Ok(KreuzbergMcp::with_config(config.clone())),
543
+ LocalSessionManager::default().into(),
544
+ Default::default(),
545
+ );
546
+
547
+ let router = Router::new().nest_service("/mcp", http_service);
548
+
549
+ let addr: SocketAddr = format!("{}:{}", host.as_ref(), port)
550
+ .parse()
551
+ .map_err(|e| format!("Invalid address: {}", e))?;
552
+
553
+ #[cfg(feature = "api")]
554
+ tracing::info!("Starting MCP HTTP server on http://{}", addr);
555
+
556
+ let listener = tokio::net::TcpListener::bind(addr).await?;
557
+ axum::serve(listener, router).await?;
558
+
559
+ Ok(())
560
+ }
561
+
456
562
  /// Build extraction config from MCP parameters.
457
563
  ///
458
564
  /// Starts with the default config and overlays OCR settings from request parameters.
@@ -750,6 +856,7 @@ mod tests {
750
856
  detected_languages: None,
751
857
  chunks: None,
752
858
  images: None,
859
+ pages: None,
753
860
  };
754
861
 
755
862
  let formatted = format_extraction_result(&result);
@@ -786,6 +893,7 @@ mod tests {
786
893
  detected_languages: None,
787
894
  chunks: None,
788
895
  images: None,
896
+ pages: None,
789
897
  };
790
898
 
791
899
  let formatted = format_extraction_result(&result);
@@ -807,6 +915,7 @@ mod tests {
807
915
  detected_languages: None,
808
916
  chunks: None,
809
917
  images: None,
918
+ pages: None,
810
919
  };
811
920
 
812
921
  let formatted = format_extraction_result(&result);
@@ -825,6 +934,7 @@ mod tests {
825
934
  detected_languages: None,
826
935
  chunks: None,
827
936
  images: None,
937
+ pages: None,
828
938
  };
829
939
 
830
940
  let formatted = format_extraction_result(&result);
@@ -1622,19 +1732,17 @@ mod tests {
1622
1732
 
1623
1733
  let result = server.batch_extract_files(Parameters(params)).await;
1624
1734
 
1625
- if result.is_ok() {
1626
- let call_result = result.unwrap();
1627
- if let Some(content) = call_result.content.first()
1628
- && let RawContent::Text(text) = &content.raw
1629
- {
1630
- assert!(text.text.contains("Document 1"));
1631
- assert!(text.text.contains("Document 2"));
1735
+ if let Ok(call_result) = result
1736
+ && let Some(content) = call_result.content.first()
1737
+ && let RawContent::Text(text) = &content.raw
1738
+ {
1739
+ assert!(text.text.contains("Document 1"));
1740
+ assert!(text.text.contains("Document 2"));
1632
1741
 
1633
- let doc1_pos = text.text.find("Document 1");
1634
- let doc2_pos = text.text.find("Document 2");
1635
- if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1636
- assert!(pos1 < pos2, "Documents should be in order");
1637
- }
1742
+ let doc1_pos = text.text.find("Document 1");
1743
+ let doc2_pos = text.text.find("Document 2");
1744
+ if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1745
+ assert!(pos1 < pos2, "Documents should be in order");
1638
1746
  }
1639
1747
  }
1640
1748
  }
@@ -161,6 +161,7 @@ impl OcrBackend for TesseractBackend {
161
161
  content: ocr_result.content,
162
162
  mime_type: ocr_result.mime_type,
163
163
  metadata,
164
+ pages: None,
164
165
  tables: ocr_result
165
166
  .tables
166
167
  .into_iter()
@@ -214,6 +215,7 @@ impl OcrBackend for TesseractBackend {
214
215
  content: ocr_result.content,
215
216
  mime_type: ocr_result.mime_type,
216
217
  metadata,
218
+ pages: None,
217
219
  tables: ocr_result
218
220
  .tables
219
221
  .into_iter()