kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -80,7 +80,8 @@ impl KreuzbergMcp {
|
|
|
80
80
|
use super::format::{build_config, format_extraction_result};
|
|
81
81
|
use crate::{extract_file, extract_file_sync};
|
|
82
82
|
|
|
83
|
-
let config =
|
|
83
|
+
let config =
|
|
84
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
84
85
|
|
|
85
86
|
let result = if params.r#async {
|
|
86
87
|
extract_file(&params.path, params.mime_type.as_deref(), &config)
|
|
@@ -114,7 +115,8 @@ impl KreuzbergMcp {
|
|
|
114
115
|
.decode(&params.data)
|
|
115
116
|
.map_err(|e| rmcp::ErrorData::invalid_params(format!("Invalid base64: {}", e), None))?;
|
|
116
117
|
|
|
117
|
-
let config =
|
|
118
|
+
let config =
|
|
119
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
118
120
|
|
|
119
121
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
120
122
|
|
|
@@ -145,7 +147,8 @@ impl KreuzbergMcp {
|
|
|
145
147
|
use super::format::{build_config, format_extraction_result};
|
|
146
148
|
use crate::{batch_extract_file, batch_extract_file_sync};
|
|
147
149
|
|
|
148
|
-
let config =
|
|
150
|
+
let config =
|
|
151
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
149
152
|
|
|
150
153
|
let results = if params.r#async {
|
|
151
154
|
batch_extract_file(params.paths.clone(), &config)
|
|
@@ -30,7 +30,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
30
30
|
&self,
|
|
31
31
|
Parameters(params): Parameters<ExtractFileParams>,
|
|
32
32
|
) -> Result<CallToolResult, McpError> {
|
|
33
|
-
let config = build_config(self.default_config(), params.
|
|
33
|
+
let config = build_config(self.default_config(), params.config)
|
|
34
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
34
35
|
|
|
35
36
|
let result = if params.r#async {
|
|
36
37
|
extract_file(&params.path, params.mime_type.as_deref(), &config)
|
|
@@ -59,7 +60,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
59
60
|
.decode(&params.data)
|
|
60
61
|
.map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
|
|
61
62
|
|
|
62
|
-
let config = build_config(self.default_config(), params.
|
|
63
|
+
let config = build_config(self.default_config(), params.config)
|
|
64
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
63
65
|
|
|
64
66
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
65
67
|
|
|
@@ -86,7 +88,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
86
88
|
&self,
|
|
87
89
|
Parameters(params): Parameters<BatchExtractFilesParams>,
|
|
88
90
|
) -> Result<CallToolResult, McpError> {
|
|
89
|
-
let config = build_config(self.default_config(), params.
|
|
91
|
+
let config = build_config(self.default_config(), params.config)
|
|
92
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
90
93
|
|
|
91
94
|
let results = if params.r#async {
|
|
92
95
|
batch_extract_file(params.paths.clone(), &config)
|
|
@@ -153,8 +156,7 @@ mod tests {
|
|
|
153
156
|
let params = ExtractFileParams {
|
|
154
157
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
155
158
|
mime_type: None,
|
|
156
|
-
|
|
157
|
-
force_ocr: false,
|
|
159
|
+
config: None,
|
|
158
160
|
r#async: true,
|
|
159
161
|
};
|
|
160
162
|
|
|
@@ -181,8 +183,7 @@ mod tests {
|
|
|
181
183
|
let params = ExtractFileParams {
|
|
182
184
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
183
185
|
mime_type: None,
|
|
184
|
-
|
|
185
|
-
force_ocr: false,
|
|
186
|
+
config: None,
|
|
186
187
|
r#async: true,
|
|
187
188
|
};
|
|
188
189
|
|
|
@@ -208,8 +209,7 @@ mod tests {
|
|
|
208
209
|
let params = ExtractFileParams {
|
|
209
210
|
path: "/nonexistent/file.pdf".to_string(),
|
|
210
211
|
mime_type: None,
|
|
211
|
-
|
|
212
|
-
force_ocr: false,
|
|
212
|
+
config: None,
|
|
213
213
|
r#async: true,
|
|
214
214
|
};
|
|
215
215
|
|
|
@@ -226,8 +226,7 @@ mod tests {
|
|
|
226
226
|
let params = ExtractFileParams {
|
|
227
227
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
228
228
|
mime_type: Some("application/pdf".to_string()),
|
|
229
|
-
|
|
230
|
-
force_ocr: false,
|
|
229
|
+
config: None,
|
|
231
230
|
r#async: true,
|
|
232
231
|
};
|
|
233
232
|
|
|
@@ -246,8 +245,7 @@ mod tests {
|
|
|
246
245
|
let params = ExtractBytesParams {
|
|
247
246
|
data: encoded,
|
|
248
247
|
mime_type: Some("text/plain".to_string()),
|
|
249
|
-
|
|
250
|
-
force_ocr: false,
|
|
248
|
+
config: None,
|
|
251
249
|
r#async: true,
|
|
252
250
|
};
|
|
253
251
|
|
|
@@ -274,8 +272,7 @@ mod tests {
|
|
|
274
272
|
let params = ExtractBytesParams {
|
|
275
273
|
data: "not-valid-base64!!!".to_string(),
|
|
276
274
|
mime_type: None,
|
|
277
|
-
|
|
278
|
-
force_ocr: false,
|
|
275
|
+
config: None,
|
|
279
276
|
r#async: true,
|
|
280
277
|
};
|
|
281
278
|
|
|
@@ -292,8 +289,7 @@ mod tests {
|
|
|
292
289
|
let server = TestMcpServer::new();
|
|
293
290
|
let params = BatchExtractFilesParams {
|
|
294
291
|
paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
|
|
295
|
-
|
|
296
|
-
force_ocr: false,
|
|
292
|
+
config: None,
|
|
297
293
|
r#async: true,
|
|
298
294
|
};
|
|
299
295
|
|
|
@@ -319,8 +315,7 @@ mod tests {
|
|
|
319
315
|
let server = TestMcpServer::new();
|
|
320
316
|
let params = BatchExtractFilesParams {
|
|
321
317
|
paths: vec![],
|
|
322
|
-
|
|
323
|
-
force_ocr: false,
|
|
318
|
+
config: None,
|
|
324
319
|
r#async: true,
|
|
325
320
|
};
|
|
326
321
|
|
|
@@ -350,8 +345,7 @@ mod tests {
|
|
|
350
345
|
let params = ExtractFileParams {
|
|
351
346
|
path: test_file.to_string(),
|
|
352
347
|
mime_type: None,
|
|
353
|
-
|
|
354
|
-
force_ocr: false,
|
|
348
|
+
config: None,
|
|
355
349
|
r#async: true,
|
|
356
350
|
};
|
|
357
351
|
|
|
@@ -378,8 +372,7 @@ mod tests {
|
|
|
378
372
|
if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
|
|
379
373
|
let params = BatchExtractFilesParams {
|
|
380
374
|
paths: vec![file1.to_string(), file2.to_string()],
|
|
381
|
-
|
|
382
|
-
force_ocr: false,
|
|
375
|
+
config: None,
|
|
383
376
|
r#async: true,
|
|
384
377
|
};
|
|
385
378
|
|
|
@@ -43,9 +43,26 @@ impl DocumentExtractorRegistry {
|
|
|
43
43
|
let priority = extractor.priority();
|
|
44
44
|
let mime_types: Vec<String> = extractor.supported_mime_types().iter().map(|s| s.to_string()).collect();
|
|
45
45
|
|
|
46
|
-
super::validate_plugin_name(&name)
|
|
46
|
+
if let Err(e) = super::validate_plugin_name(&name) {
|
|
47
|
+
tracing::warn!(
|
|
48
|
+
"Failed to validate document extractor name '{}': {}. \
|
|
49
|
+
Registration aborted. Plugin names must be non-empty and contain only alphanumeric characters, hyphens, and underscores.",
|
|
50
|
+
name,
|
|
51
|
+
e
|
|
52
|
+
);
|
|
53
|
+
return Err(e);
|
|
54
|
+
}
|
|
47
55
|
|
|
48
|
-
extractor.initialize()
|
|
56
|
+
if let Err(e) = extractor.initialize() {
|
|
57
|
+
tracing::error!(
|
|
58
|
+
"Failed to initialize document extractor '{}': {}. \
|
|
59
|
+
Extraction for MIME types {:?} will be unavailable.",
|
|
60
|
+
name,
|
|
61
|
+
e,
|
|
62
|
+
mime_types
|
|
63
|
+
);
|
|
64
|
+
return Err(e);
|
|
65
|
+
}
|
|
49
66
|
|
|
50
67
|
let mut index_entries = Vec::new();
|
|
51
68
|
|
|
@@ -57,7 +74,13 @@ impl DocumentExtractorRegistry {
|
|
|
57
74
|
index_entries.push((mime_type.clone(), priority));
|
|
58
75
|
}
|
|
59
76
|
|
|
60
|
-
self.name_index.insert(name, index_entries);
|
|
77
|
+
self.name_index.insert(name.clone(), index_entries);
|
|
78
|
+
tracing::debug!(
|
|
79
|
+
"Registered document extractor '{}' with priority {} for MIME types: {:?}",
|
|
80
|
+
name,
|
|
81
|
+
priority,
|
|
82
|
+
mime_types
|
|
83
|
+
);
|
|
61
84
|
|
|
62
85
|
Ok(())
|
|
63
86
|
}
|
|
@@ -128,7 +151,13 @@ impl DocumentExtractorRegistry {
|
|
|
128
151
|
pub fn remove(&mut self, name: &str) -> Result<()> {
|
|
129
152
|
let index_entries = match self.name_index.remove(name) {
|
|
130
153
|
Some(entries) => entries,
|
|
131
|
-
None =>
|
|
154
|
+
None => {
|
|
155
|
+
tracing::debug!(
|
|
156
|
+
"Document extractor '{}' not found in registry (already removed or never registered)",
|
|
157
|
+
name
|
|
158
|
+
);
|
|
159
|
+
return Ok(());
|
|
160
|
+
}
|
|
132
161
|
};
|
|
133
162
|
|
|
134
163
|
let mut extractor_to_shutdown: Option<Arc<dyn DocumentExtractor>> = None;
|
|
@@ -148,7 +177,16 @@ impl DocumentExtractorRegistry {
|
|
|
148
177
|
}
|
|
149
178
|
|
|
150
179
|
if let Some(extractor) = extractor_to_shutdown {
|
|
151
|
-
extractor.shutdown()
|
|
180
|
+
if let Err(e) = extractor.shutdown() {
|
|
181
|
+
tracing::warn!(
|
|
182
|
+
"Failed to shutdown document extractor '{}': {}. \
|
|
183
|
+
Resources may not have been properly released.",
|
|
184
|
+
name,
|
|
185
|
+
e
|
|
186
|
+
);
|
|
187
|
+
return Err(e);
|
|
188
|
+
}
|
|
189
|
+
tracing::debug!("Successfully removed and shut down document extractor '{}'", name);
|
|
152
190
|
}
|
|
153
191
|
|
|
154
192
|
Ok(())
|
|
@@ -157,9 +195,19 @@ impl DocumentExtractorRegistry {
|
|
|
157
195
|
/// Shutdown all extractors and clear the registry.
|
|
158
196
|
pub fn shutdown_all(&mut self) -> Result<()> {
|
|
159
197
|
let names = self.list();
|
|
198
|
+
let count = names.len();
|
|
199
|
+
|
|
200
|
+
if count > 0 {
|
|
201
|
+
tracing::debug!("Shutting down {} document extractors", count);
|
|
202
|
+
}
|
|
203
|
+
|
|
160
204
|
for name in names {
|
|
161
205
|
self.remove(&name)?;
|
|
162
206
|
}
|
|
207
|
+
|
|
208
|
+
if count > 0 {
|
|
209
|
+
tracing::debug!("Successfully shut down all {} document extractors", count);
|
|
210
|
+
}
|
|
163
211
|
Ok(())
|
|
164
212
|
}
|
|
165
213
|
}
|
|
@@ -413,4 +461,202 @@ mod tests {
|
|
|
413
461
|
assert_eq!(registry.get("text/markdown").unwrap().name(), "multi-extractor");
|
|
414
462
|
assert_eq!(registry.get("text/html").unwrap().name(), "multi-extractor");
|
|
415
463
|
}
|
|
464
|
+
|
|
465
|
+
struct FailingExtractor {
|
|
466
|
+
name: String,
|
|
467
|
+
fail_on_init: bool,
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
impl Plugin for FailingExtractor {
|
|
471
|
+
fn name(&self) -> &str {
|
|
472
|
+
&self.name
|
|
473
|
+
}
|
|
474
|
+
fn version(&self) -> String {
|
|
475
|
+
"1.0.0".to_string()
|
|
476
|
+
}
|
|
477
|
+
fn initialize(&self) -> Result<()> {
|
|
478
|
+
if self.fail_on_init {
|
|
479
|
+
Err(KreuzbergError::Plugin {
|
|
480
|
+
message: "Extractor initialization failed".to_string(),
|
|
481
|
+
plugin_name: self.name.clone(),
|
|
482
|
+
})
|
|
483
|
+
} else {
|
|
484
|
+
Ok(())
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
fn shutdown(&self) -> Result<()> {
|
|
488
|
+
Ok(())
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
#[async_trait]
|
|
493
|
+
impl DocumentExtractor for FailingExtractor {
|
|
494
|
+
async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
495
|
+
Ok(ExtractionResult {
|
|
496
|
+
content: "test".to_string(),
|
|
497
|
+
mime_type: "text/plain".to_string(),
|
|
498
|
+
metadata: crate::types::Metadata::default(),
|
|
499
|
+
tables: vec![],
|
|
500
|
+
detected_languages: None,
|
|
501
|
+
chunks: None,
|
|
502
|
+
images: None,
|
|
503
|
+
djot_content: None,
|
|
504
|
+
pages: None,
|
|
505
|
+
elements: None,
|
|
506
|
+
})
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
510
|
+
&["text/plain"]
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
fn priority(&self) -> i32 {
|
|
514
|
+
50
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
#[test]
|
|
519
|
+
fn test_document_extractor_initialization_failure_logs_error() {
|
|
520
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
521
|
+
|
|
522
|
+
let extractor = Arc::new(FailingExtractor {
|
|
523
|
+
name: "failing-extractor".to_string(),
|
|
524
|
+
fail_on_init: true,
|
|
525
|
+
});
|
|
526
|
+
|
|
527
|
+
let result = registry.register(extractor);
|
|
528
|
+
assert!(result.is_err());
|
|
529
|
+
assert_eq!(registry.list().len(), 0);
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
#[test]
|
|
533
|
+
fn test_document_extractor_invalid_name_empty_logs_warning() {
|
|
534
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
535
|
+
|
|
536
|
+
let extractor = Arc::new(MockExtractor {
|
|
537
|
+
name: "".to_string(),
|
|
538
|
+
mime_types: &["text/plain"],
|
|
539
|
+
priority: 50,
|
|
540
|
+
});
|
|
541
|
+
|
|
542
|
+
let result = registry.register(extractor);
|
|
543
|
+
assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
#[test]
|
|
547
|
+
fn test_document_extractor_invalid_name_with_spaces_logs_warning() {
|
|
548
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
549
|
+
|
|
550
|
+
let extractor = Arc::new(MockExtractor {
|
|
551
|
+
name: "invalid extractor".to_string(),
|
|
552
|
+
mime_types: &["text/plain"],
|
|
553
|
+
priority: 50,
|
|
554
|
+
});
|
|
555
|
+
|
|
556
|
+
let result = registry.register(extractor);
|
|
557
|
+
assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
#[test]
|
|
561
|
+
fn test_document_extractor_successful_registration_logs_debug() {
|
|
562
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
563
|
+
|
|
564
|
+
let extractor = Arc::new(MockExtractor {
|
|
565
|
+
name: "valid-pdf-extractor".to_string(),
|
|
566
|
+
mime_types: &["application/pdf"],
|
|
567
|
+
priority: 100,
|
|
568
|
+
});
|
|
569
|
+
|
|
570
|
+
let result = registry.register(extractor);
|
|
571
|
+
assert!(result.is_ok());
|
|
572
|
+
assert_eq!(registry.list().len(), 1);
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
#[test]
|
|
576
|
+
fn test_document_extractor_remove_nonexistent_logs_debug() {
|
|
577
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
578
|
+
|
|
579
|
+
let result = registry.remove("nonexistent-extractor");
|
|
580
|
+
assert!(result.is_ok());
|
|
581
|
+
assert_eq!(registry.list().len(), 0);
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
#[test]
|
|
585
|
+
fn test_document_extractor_shutdown_empty_registry() {
|
|
586
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
587
|
+
let result = registry.shutdown_all();
|
|
588
|
+
assert!(result.is_ok());
|
|
589
|
+
assert_eq!(registry.list().len(), 0);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
#[test]
|
|
593
|
+
fn test_document_extractor_shutdown_with_multiple_extractors() {
|
|
594
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
595
|
+
|
|
596
|
+
let extractor1 = Arc::new(MockExtractor {
|
|
597
|
+
name: "extractor1".to_string(),
|
|
598
|
+
mime_types: &["text/plain"],
|
|
599
|
+
priority: 50,
|
|
600
|
+
});
|
|
601
|
+
|
|
602
|
+
let extractor2 = Arc::new(MockExtractor {
|
|
603
|
+
name: "extractor2".to_string(),
|
|
604
|
+
mime_types: &["application/pdf"],
|
|
605
|
+
priority: 100,
|
|
606
|
+
});
|
|
607
|
+
|
|
608
|
+
let extractor3 = Arc::new(MockExtractor {
|
|
609
|
+
name: "extractor3".to_string(),
|
|
610
|
+
mime_types: &["image/png"],
|
|
611
|
+
priority: 75,
|
|
612
|
+
});
|
|
613
|
+
|
|
614
|
+
registry.register(extractor1).unwrap();
|
|
615
|
+
registry.register(extractor2).unwrap();
|
|
616
|
+
registry.register(extractor3).unwrap();
|
|
617
|
+
|
|
618
|
+
assert_eq!(registry.list().len(), 3);
|
|
619
|
+
|
|
620
|
+
registry.shutdown_all().unwrap();
|
|
621
|
+
assert_eq!(registry.list().len(), 0);
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
#[test]
|
|
625
|
+
fn test_document_extractor_priority_ordering_complex() {
|
|
626
|
+
let mut registry = DocumentExtractorRegistry::new();
|
|
627
|
+
|
|
628
|
+
let extractors = vec![
|
|
629
|
+
(
|
|
630
|
+
Arc::new(MockExtractor {
|
|
631
|
+
name: "priority-1".to_string(),
|
|
632
|
+
mime_types: &["application/pdf"],
|
|
633
|
+
priority: 1,
|
|
634
|
+
}),
|
|
635
|
+
1,
|
|
636
|
+
),
|
|
637
|
+
(
|
|
638
|
+
Arc::new(MockExtractor {
|
|
639
|
+
name: "priority-100".to_string(),
|
|
640
|
+
mime_types: &["application/pdf"],
|
|
641
|
+
priority: 100,
|
|
642
|
+
}),
|
|
643
|
+
100,
|
|
644
|
+
),
|
|
645
|
+
(
|
|
646
|
+
Arc::new(MockExtractor {
|
|
647
|
+
name: "priority-50".to_string(),
|
|
648
|
+
mime_types: &["application/pdf"],
|
|
649
|
+
priority: 50,
|
|
650
|
+
}),
|
|
651
|
+
50,
|
|
652
|
+
),
|
|
653
|
+
];
|
|
654
|
+
|
|
655
|
+
for (extractor, _priority) in &extractors {
|
|
656
|
+
registry.register(extractor.clone()).unwrap();
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
let retrieved = registry.get("application/pdf").unwrap();
|
|
660
|
+
assert_eq!(retrieved.name(), "priority-100");
|
|
661
|
+
}
|
|
416
662
|
}
|
|
@@ -31,6 +31,8 @@ impl OcrBackendRegistry {
|
|
|
31
31
|
/// Create a new OCR backend registry with default backends.
|
|
32
32
|
///
|
|
33
33
|
/// Registers the Tesseract backend by default if the "ocr" feature is enabled.
|
|
34
|
+
/// Logs warnings if backend initialization fails (common in containerized environments
|
|
35
|
+
/// with missing dependencies or permission issues).
|
|
34
36
|
pub fn new() -> Self {
|
|
35
37
|
#[cfg(feature = "ocr")]
|
|
36
38
|
let mut registry = Self {
|
|
@@ -45,8 +47,27 @@ impl OcrBackendRegistry {
|
|
|
45
47
|
#[cfg(feature = "ocr")]
|
|
46
48
|
{
|
|
47
49
|
use crate::ocr::tesseract_backend::TesseractBackend;
|
|
48
|
-
|
|
49
|
-
|
|
50
|
+
match TesseractBackend::new() {
|
|
51
|
+
Ok(backend) => {
|
|
52
|
+
if let Err(e) = registry.register(Arc::new(backend)) {
|
|
53
|
+
tracing::error!(
|
|
54
|
+
"Failed to register Tesseract OCR backend: {}. \
|
|
55
|
+
OCR functionality will be unavailable. \
|
|
56
|
+
Check TESSDATA_PREFIX environment variable and tessdata file permissions.",
|
|
57
|
+
e
|
|
58
|
+
);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
Err(e) => {
|
|
62
|
+
tracing::warn!(
|
|
63
|
+
"Tesseract OCR backend initialization failed: {}. \
|
|
64
|
+
OCR functionality will be unavailable. \
|
|
65
|
+
Common causes: missing TESSDATA_PREFIX env var, \
|
|
66
|
+
tessdata files not found, or permission issues in containerized environments. \
|
|
67
|
+
See https://docs.kreuzberg.dev/guides/docker/ for Kubernetes troubleshooting.",
|
|
68
|
+
e
|
|
69
|
+
);
|
|
70
|
+
}
|
|
50
71
|
}
|
|
51
72
|
}
|
|
52
73
|
|
|
@@ -290,4 +311,131 @@ mod tests {
|
|
|
290
311
|
registry.shutdown_all().unwrap();
|
|
291
312
|
assert_eq!(registry.list().len(), 0);
|
|
292
313
|
}
|
|
314
|
+
|
|
315
|
+
struct FailingOcrBackend {
|
|
316
|
+
name: String,
|
|
317
|
+
fail_on_init: bool,
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
impl Plugin for FailingOcrBackend {
|
|
321
|
+
fn name(&self) -> &str {
|
|
322
|
+
&self.name
|
|
323
|
+
}
|
|
324
|
+
fn version(&self) -> String {
|
|
325
|
+
"1.0.0".to_string()
|
|
326
|
+
}
|
|
327
|
+
fn initialize(&self) -> Result<()> {
|
|
328
|
+
if self.fail_on_init {
|
|
329
|
+
Err(KreuzbergError::Plugin {
|
|
330
|
+
message: "Backend initialization failed".to_string(),
|
|
331
|
+
plugin_name: self.name.clone(),
|
|
332
|
+
})
|
|
333
|
+
} else {
|
|
334
|
+
Ok(())
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
fn shutdown(&self) -> Result<()> {
|
|
338
|
+
Ok(())
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
#[async_trait]
|
|
343
|
+
impl OcrBackend for FailingOcrBackend {
|
|
344
|
+
async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
|
|
345
|
+
Ok(ExtractionResult {
|
|
346
|
+
content: "test".to_string(),
|
|
347
|
+
mime_type: "text/plain".to_string(),
|
|
348
|
+
metadata: crate::types::Metadata::default(),
|
|
349
|
+
tables: vec![],
|
|
350
|
+
detected_languages: None,
|
|
351
|
+
chunks: None,
|
|
352
|
+
images: None,
|
|
353
|
+
djot_content: None,
|
|
354
|
+
pages: None,
|
|
355
|
+
elements: None,
|
|
356
|
+
})
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
fn supports_language(&self, _lang: &str) -> bool {
|
|
360
|
+
false
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
fn backend_type(&self) -> crate::plugins::ocr::OcrBackendType {
|
|
364
|
+
crate::plugins::ocr::OcrBackendType::Custom
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
#[test]
|
|
369
|
+
fn test_ocr_backend_initialization_failure_logs_error() {
|
|
370
|
+
let mut registry = OcrBackendRegistry::new_empty();
|
|
371
|
+
|
|
372
|
+
let backend = Arc::new(FailingOcrBackend {
|
|
373
|
+
name: "failing-ocr".to_string(),
|
|
374
|
+
fail_on_init: true,
|
|
375
|
+
});
|
|
376
|
+
|
|
377
|
+
let result = registry.register(backend);
|
|
378
|
+
assert!(result.is_err());
|
|
379
|
+
assert_eq!(registry.list().len(), 0);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
#[test]
|
|
383
|
+
fn test_ocr_backend_invalid_name_empty_logs_warning() {
|
|
384
|
+
let mut registry = OcrBackendRegistry::new_empty();
|
|
385
|
+
|
|
386
|
+
let backend = Arc::new(MockOcrBackend {
|
|
387
|
+
name: "".to_string(),
|
|
388
|
+
languages: vec!["eng".to_string()],
|
|
389
|
+
});
|
|
390
|
+
|
|
391
|
+
let result = registry.register(backend);
|
|
392
|
+
assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
#[test]
|
|
396
|
+
fn test_ocr_backend_invalid_name_with_spaces_logs_warning() {
|
|
397
|
+
let mut registry = OcrBackendRegistry::new_empty();
|
|
398
|
+
|
|
399
|
+
let backend = Arc::new(MockOcrBackend {
|
|
400
|
+
name: "invalid ocr backend".to_string(),
|
|
401
|
+
languages: vec!["eng".to_string()],
|
|
402
|
+
});
|
|
403
|
+
|
|
404
|
+
let result = registry.register(backend);
|
|
405
|
+
assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
#[test]
|
|
409
|
+
fn test_ocr_backend_successful_registration_logs_debug() {
|
|
410
|
+
let mut registry = OcrBackendRegistry::new_empty();
|
|
411
|
+
|
|
412
|
+
let backend = Arc::new(MockOcrBackend {
|
|
413
|
+
name: "valid-ocr".to_string(),
|
|
414
|
+
languages: vec!["eng".to_string()],
|
|
415
|
+
});
|
|
416
|
+
|
|
417
|
+
let result = registry.register(backend);
|
|
418
|
+
assert!(result.is_ok());
|
|
419
|
+
assert_eq!(registry.list().len(), 1);
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[test]
|
|
423
|
+
fn test_ocr_backend_multiple_registrations() {
|
|
424
|
+
let mut registry = OcrBackendRegistry::new_empty();
|
|
425
|
+
|
|
426
|
+
let backend1 = Arc::new(MockOcrBackend {
|
|
427
|
+
name: "ocr-backend-1".to_string(),
|
|
428
|
+
languages: vec!["eng".to_string()],
|
|
429
|
+
});
|
|
430
|
+
|
|
431
|
+
let backend2 = Arc::new(MockOcrBackend {
|
|
432
|
+
name: "ocr-backend-2".to_string(),
|
|
433
|
+
languages: vec!["deu".to_string()],
|
|
434
|
+
});
|
|
435
|
+
|
|
436
|
+
registry.register(backend1).unwrap();
|
|
437
|
+
registry.register(backend2).unwrap();
|
|
438
|
+
|
|
439
|
+
assert_eq!(registry.list().len(), 2);
|
|
440
|
+
}
|
|
293
441
|
}
|