kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -4,8 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::Result;
|
|
6
6
|
use crate::core::config::LanguageDetectionConfig;
|
|
7
|
+
use once_cell::sync::Lazy;
|
|
8
|
+
use std::sync::Arc;
|
|
7
9
|
use whatlang::{Lang, detect};
|
|
8
10
|
|
|
11
|
+
pub mod processor;
|
|
12
|
+
pub use processor::LanguageDetector;
|
|
13
|
+
|
|
9
14
|
/// Detect languages in text using whatlang.
|
|
10
15
|
///
|
|
11
16
|
/// Returns a list of detected language codes (ISO 639-3 format).
|
|
@@ -940,3 +945,41 @@ mod tests {
|
|
|
940
945
|
assert_eq!(langs[0], "eng");
|
|
941
946
|
}
|
|
942
947
|
}
|
|
948
|
+
|
|
949
|
+
/// Lazy-initialized flag that ensures language detection processor is registered exactly once.
|
|
950
|
+
///
|
|
951
|
+
/// This static is accessed on first use to automatically register the
|
|
952
|
+
/// language detection processor with the plugin registry.
|
|
953
|
+
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
|
|
954
|
+
|
|
955
|
+
/// Ensure the language detection processor is registered.
|
|
956
|
+
///
|
|
957
|
+
/// This function is called automatically when needed.
|
|
958
|
+
/// It's safe to call multiple times - registration only happens once.
|
|
959
|
+
pub fn ensure_initialized() -> Result<()> {
|
|
960
|
+
PROCESSOR_INITIALIZED
|
|
961
|
+
.as_ref()
|
|
962
|
+
.map(|_| ())
|
|
963
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
964
|
+
message: format!("Failed to register language detection processor: {}", e),
|
|
965
|
+
plugin_name: "language-detection".to_string(),
|
|
966
|
+
})
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
/// Register the language detection processor with the global registry.
|
|
970
|
+
///
|
|
971
|
+
/// This function should be called once at application startup to register
|
|
972
|
+
/// the language detection post-processor.
|
|
973
|
+
///
|
|
974
|
+
/// **Note:** This is called automatically on first use.
|
|
975
|
+
/// Explicit calling is optional.
|
|
976
|
+
pub fn register_language_detection_processor() -> Result<()> {
|
|
977
|
+
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
978
|
+
let mut registry = registry
|
|
979
|
+
.write()
|
|
980
|
+
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
981
|
+
|
|
982
|
+
registry.register(Arc::new(LanguageDetector), 40)?;
|
|
983
|
+
|
|
984
|
+
Ok(())
|
|
985
|
+
}
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
//! Language detection post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that detects languages in
|
|
4
|
+
//! extraction results and stores them in the result.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor plugin that runs language detection over extracted content.
///
/// As implemented by this type's trait impls, the processor:
/// - reports the `Early` processing stage
/// - is a no-op unless `config.language_detection` is set
/// - writes its outcome to `result.detected_languages`
/// - delegates the actual detection to the parent module (whatlang-based)
///
/// The type is a stateless unit struct, so it is cheap to copy and share.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::plugins::{Plugin, PostProcessor};
/// use kreuzberg::language_detection::processor::LanguageDetector;
///
/// let processor = LanguageDetector;
/// assert_eq!(processor.name(), "language-detection");
/// ```
#[derive(Clone, Copy, Debug)]
pub struct LanguageDetector;
|
|
29
|
+
|
|
30
|
+
impl Plugin for LanguageDetector {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"language-detection"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for LanguageDetector {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
let lang_config = match &config.language_detection {
|
|
53
|
+
Some(cfg) => cfg,
|
|
54
|
+
None => return Ok(()),
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
match super::detect_languages(&result.content, lang_config)
|
|
58
|
+
.map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
|
|
59
|
+
{
|
|
60
|
+
Some(languages) => {
|
|
61
|
+
result.detected_languages = Some(languages);
|
|
62
|
+
}
|
|
63
|
+
None => {
|
|
64
|
+
result.detected_languages = None;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
+
ProcessingStage::Early
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
+
config.language_detection.is_some()
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
+
let text_length = result.content.len();
|
|
81
|
+
// Language detection is relatively fast: ~1ms per 1KB
|
|
82
|
+
(text_length / 1024).max(1) as u64
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::LanguageDetectionConfig;
    use crate::types::Metadata;

    /// Build a plain-text extraction result with the given content and every
    /// optional field left empty.
    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Extraction config with single-language detection switched on.
    fn detection_enabled_config() -> ExtractionConfig {
        ExtractionConfig {
            language_detection: Some(LanguageDetectionConfig {
                enabled: true,
                min_confidence: 0.8,
                detect_multiple: false,
            }),
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_language_detector_processor() {
        let processor = LanguageDetector;
        let config = detection_enabled_config();
        let mut result = plain_text_result("Hello world! This is a test of the language detection system.");

        processor.process(&mut result, &config).await.unwrap();

        let langs = result
            .detected_languages
            .expect("languages should be detected for English text");
        assert!(!langs.is_empty());
        assert_eq!(langs[0], "eng");
    }

    #[tokio::test]
    async fn test_language_detector_no_config() {
        let processor = LanguageDetector;
        let config = ExtractionConfig::default();
        let mut result = plain_text_result("Hello world!");

        processor.process(&mut result, &config).await.unwrap();

        assert!(result.detected_languages.is_none());
    }

    #[test]
    fn test_language_detector_plugin_interface() {
        let processor = LanguageDetector;
        assert_eq!(processor.name(), "language-detection");
        assert!(!processor.version().is_empty());
        assert!(processor.initialize().is_ok());
        assert!(processor.shutdown().is_ok());
    }

    #[test]
    fn test_language_detector_stage() {
        let processor = LanguageDetector;
        assert_eq!(processor.processing_stage(), ProcessingStage::Early);
    }

    #[test]
    fn test_language_detector_should_process() {
        let processor = LanguageDetector;
        let result = plain_text_result("Sample text");

        let config_with_lang = detection_enabled_config();
        assert!(processor.should_process(&result, &config_with_lang));

        let config_without_lang = ExtractionConfig::default();
        assert!(!processor.should_process(&result, &config_without_lang));
    }

    #[test]
    fn test_language_detector_estimated_duration() {
        let processor = LanguageDetector;

        let short_result = plain_text_result("Short");
        let long_result = plain_text_result("a".repeat(10000));

        let short_duration = processor.estimated_duration_ms(&short_result);
        let long_duration = processor.estimated_duration_ms(&long_result);

        assert!(long_duration > short_duration);
    }
}
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -84,13 +84,21 @@ pub use types::*;
|
|
|
84
84
|
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
85
85
|
pub use core::extractor::{extract_bytes, extract_file};
|
|
86
86
|
|
|
87
|
-
|
|
87
|
+
// Available in WASM (bytes-based)
|
|
88
|
+
pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
89
|
+
|
|
90
|
+
// Only available with filesystem access
|
|
91
|
+
#[cfg(feature = "tokio-runtime")]
|
|
92
|
+
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
88
93
|
|
|
89
94
|
pub use core::config::{
|
|
90
95
|
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
91
|
-
LanguageDetectionConfig, OcrConfig,
|
|
96
|
+
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
92
97
|
};
|
|
93
98
|
|
|
99
|
+
#[cfg(feature = "pdf")]
|
|
100
|
+
pub use core::config::PdfConfig;
|
|
101
|
+
|
|
94
102
|
pub use core::mime::{
|
|
95
103
|
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|
|
96
104
|
PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
|
|
@@ -26,6 +26,9 @@ mod server;
|
|
|
26
26
|
|
|
27
27
|
pub use server::{start_mcp_server, start_mcp_server_with_config};
|
|
28
28
|
|
|
29
|
+
#[cfg(feature = "mcp-http")]
|
|
30
|
+
pub use server::{start_mcp_server_http, start_mcp_server_http_with_config};
|
|
31
|
+
|
|
29
32
|
pub use server::{BatchExtractFilesParams, DetectMimeTypeParams, ExtractBytesParams, ExtractFileParams, KreuzbergMcp};
|
|
30
33
|
|
|
31
34
|
#[doc(hidden)]
|
|
@@ -12,6 +12,9 @@ use rmcp::{
|
|
|
12
12
|
transport::stdio,
|
|
13
13
|
};
|
|
14
14
|
|
|
15
|
+
#[cfg(feature = "mcp-http")]
|
|
16
|
+
use rmcp::transport::streamable_http_server::{StreamableHttpService, session::local::LocalSessionManager};
|
|
17
|
+
|
|
15
18
|
use crate::{
|
|
16
19
|
ExtractionConfig, ExtractionResult as KreuzbergResult, KreuzbergError, batch_extract_file, batch_extract_file_sync,
|
|
17
20
|
cache, detect_mime_type, extract_bytes, extract_bytes_sync, extract_file, extract_file_sync,
|
|
@@ -453,6 +456,109 @@ pub async fn start_mcp_server_with_config(
|
|
|
453
456
|
Ok(())
|
|
454
457
|
}
|
|
455
458
|
|
|
459
|
+
/// Build the socket address for the MCP HTTP listener.
///
/// `host` is first tried as a bare IP literal so that unbracketed IPv6
/// addresses such as `::1` work; joining those with `:{port}` would not parse
/// as a socket address. Anything else falls back to parsing `"{host}:{port}"`,
/// which keeps accepting bracketed IPv6 hosts like `[::1]`. Host names are
/// not resolved.
#[cfg(feature = "mcp-http")]
fn resolve_http_bind_addr(host: &str, port: u16) -> Result<std::net::SocketAddr, String> {
    if let Ok(ip) = host.parse::<std::net::IpAddr>() {
        return Ok(std::net::SocketAddr::new(ip, port));
    }

    format!("{}:{}", host, port)
        .parse()
        .map_err(|e: std::net::AddrParseError| format!("Invalid address: {}", e))
}

/// Start MCP server with HTTP Stream transport.
///
/// Uses rmcp's built-in StreamableHttpService for HTTP/SSE support per MCP spec.
/// The service is mounted under the `/mcp` path and served until the server
/// stops or fails.
///
/// # Arguments
///
/// * `host` - IP address to bind to (e.g., "127.0.0.1", "0.0.0.0" or "::1")
/// * `port` - Port number (e.g., 8001)
///
/// # Errors
///
/// Returns an error if `host` and `port` do not form a valid socket address,
/// if the listener cannot be bound, or if serving fails.
///
/// # Example
///
/// ```no_run
/// use kreuzberg::mcp::start_mcp_server_http;
///
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
///     start_mcp_server_http("127.0.0.1", 8001).await?;
///     Ok(())
/// }
/// ```
#[cfg(feature = "mcp-http")]
pub async fn start_mcp_server_http(
    host: impl AsRef<str>,
    port: u16,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    use axum::Router;

    // The factory closure produces a server instance from the default constructor.
    let http_service = StreamableHttpService::new(
        || KreuzbergMcp::new().map_err(|e| std::io::Error::other(e.to_string())),
        LocalSessionManager::default().into(),
        Default::default(),
    );

    let router = Router::new().nest_service("/mcp", http_service);

    let addr = resolve_http_bind_addr(host.as_ref(), port)?;

    #[cfg(feature = "api")]
    tracing::info!("Starting MCP HTTP server on http://{}", addr);

    let listener = tokio::net::TcpListener::bind(addr).await?;
    axum::serve(listener, router).await?;

    Ok(())
}

/// Start MCP HTTP server with custom extraction config.
///
/// This variant allows specifying a custom extraction configuration
/// while using HTTP Stream transport. The service is mounted under the
/// `/mcp` path and served until the server stops or fails.
///
/// # Arguments
///
/// * `host` - IP address to bind to (e.g., "127.0.0.1", "0.0.0.0" or "::1")
/// * `port` - Port number (e.g., 8001)
/// * `config` - Custom extraction configuration
///
/// # Errors
///
/// Returns an error if `host` and `port` do not form a valid socket address,
/// if the listener cannot be bound, or if serving fails.
///
/// # Example
///
/// ```no_run
/// use kreuzberg::mcp::start_mcp_server_http_with_config;
/// use kreuzberg::ExtractionConfig;
///
/// #[tokio::main]
/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
///     let config = ExtractionConfig::default();
///     start_mcp_server_http_with_config("127.0.0.1", 8001, config).await?;
///     Ok(())
/// }
/// ```
#[cfg(feature = "mcp-http")]
pub async fn start_mcp_server_http_with_config(
    host: impl AsRef<str>,
    port: u16,
    config: ExtractionConfig,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    use axum::Router;

    // The factory closure hands each produced server its own clone of the config.
    let http_service = StreamableHttpService::new(
        move || Ok(KreuzbergMcp::with_config(config.clone())),
        LocalSessionManager::default().into(),
        Default::default(),
    );

    let router = Router::new().nest_service("/mcp", http_service);

    let addr = resolve_http_bind_addr(host.as_ref(), port)?;

    #[cfg(feature = "api")]
    tracing::info!("Starting MCP HTTP server on http://{}", addr);

    let listener = tokio::net::TcpListener::bind(addr).await?;
    axum::serve(listener, router).await?;

    Ok(())
}
|
|
561
|
+
|
|
456
562
|
/// Build extraction config from MCP parameters.
|
|
457
563
|
///
|
|
458
564
|
/// Starts with the default config and overlays OCR settings from request parameters.
|
|
@@ -750,6 +856,7 @@ mod tests {
|
|
|
750
856
|
detected_languages: None,
|
|
751
857
|
chunks: None,
|
|
752
858
|
images: None,
|
|
859
|
+
pages: None,
|
|
753
860
|
};
|
|
754
861
|
|
|
755
862
|
let formatted = format_extraction_result(&result);
|
|
@@ -786,6 +893,7 @@ mod tests {
|
|
|
786
893
|
detected_languages: None,
|
|
787
894
|
chunks: None,
|
|
788
895
|
images: None,
|
|
896
|
+
pages: None,
|
|
789
897
|
};
|
|
790
898
|
|
|
791
899
|
let formatted = format_extraction_result(&result);
|
|
@@ -807,6 +915,7 @@ mod tests {
|
|
|
807
915
|
detected_languages: None,
|
|
808
916
|
chunks: None,
|
|
809
917
|
images: None,
|
|
918
|
+
pages: None,
|
|
810
919
|
};
|
|
811
920
|
|
|
812
921
|
let formatted = format_extraction_result(&result);
|
|
@@ -825,6 +934,7 @@ mod tests {
|
|
|
825
934
|
detected_languages: None,
|
|
826
935
|
chunks: None,
|
|
827
936
|
images: None,
|
|
937
|
+
pages: None,
|
|
828
938
|
};
|
|
829
939
|
|
|
830
940
|
let formatted = format_extraction_result(&result);
|
|
@@ -1622,19 +1732,17 @@ mod tests {
|
|
|
1622
1732
|
|
|
1623
1733
|
let result = server.batch_extract_files(Parameters(params)).await;
|
|
1624
1734
|
|
|
1625
|
-
if
|
|
1626
|
-
let
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
assert!(text.text.contains("Document 2"));
|
|
1735
|
+
if let Ok(call_result) = result
|
|
1736
|
+
&& let Some(content) = call_result.content.first()
|
|
1737
|
+
&& let RawContent::Text(text) = &content.raw
|
|
1738
|
+
{
|
|
1739
|
+
assert!(text.text.contains("Document 1"));
|
|
1740
|
+
assert!(text.text.contains("Document 2"));
|
|
1632
1741
|
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
}
|
|
1742
|
+
let doc1_pos = text.text.find("Document 1");
|
|
1743
|
+
let doc2_pos = text.text.find("Document 2");
|
|
1744
|
+
if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
|
|
1745
|
+
assert!(pos1 < pos2, "Documents should be in order");
|
|
1638
1746
|
}
|
|
1639
1747
|
}
|
|
1640
1748
|
}
|
|
@@ -161,6 +161,7 @@ impl OcrBackend for TesseractBackend {
|
|
|
161
161
|
content: ocr_result.content,
|
|
162
162
|
mime_type: ocr_result.mime_type,
|
|
163
163
|
metadata,
|
|
164
|
+
pages: None,
|
|
164
165
|
tables: ocr_result
|
|
165
166
|
.tables
|
|
166
167
|
.into_iter()
|
|
@@ -214,6 +215,7 @@ impl OcrBackend for TesseractBackend {
|
|
|
214
215
|
content: ocr_result.content,
|
|
215
216
|
mime_type: ocr_result.mime_type,
|
|
216
217
|
metadata,
|
|
218
|
+
pages: None,
|
|
217
219
|
tables: ocr_result
|
|
218
220
|
.tables
|
|
219
221
|
.into_iter()
|