kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
7
|
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
8
|
use async_trait::async_trait;
|
|
9
|
+
use std::borrow::Cow;
|
|
9
10
|
|
|
10
11
|
/// Post-processor that extracts keywords from document content.
|
|
11
12
|
///
|
|
@@ -65,7 +66,7 @@ impl PostProcessor for KeywordExtractor {
|
|
|
65
66
|
result
|
|
66
67
|
.metadata
|
|
67
68
|
.additional
|
|
68
|
-
.insert("keywords"
|
|
69
|
+
.insert(Cow::Borrowed("keywords"), serde_json::to_value(&keywords)?);
|
|
69
70
|
|
|
70
71
|
Ok(())
|
|
71
72
|
}
|
|
@@ -107,7 +108,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
107
108
|
|
|
108
109
|
let mut result = ExtractionResult {
|
|
109
110
|
content: TEST_TEXT.to_string(),
|
|
110
|
-
mime_type: "text/plain"
|
|
111
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
111
112
|
metadata: Metadata::default(),
|
|
112
113
|
tables: vec![],
|
|
113
114
|
detected_languages: None,
|
|
@@ -138,7 +139,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
138
139
|
|
|
139
140
|
let mut result = ExtractionResult {
|
|
140
141
|
content: TEST_TEXT.to_string(),
|
|
141
|
-
mime_type: "text/plain"
|
|
142
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
142
143
|
metadata: Metadata::default(),
|
|
143
144
|
tables: vec![],
|
|
144
145
|
detected_languages: None,
|
|
@@ -165,7 +166,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
165
166
|
|
|
166
167
|
let mut result = ExtractionResult {
|
|
167
168
|
content: TEST_TEXT.to_string(),
|
|
168
|
-
mime_type: "text/plain"
|
|
169
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
169
170
|
metadata: Metadata::default(),
|
|
170
171
|
tables: vec![],
|
|
171
172
|
detected_languages: None,
|
|
@@ -192,7 +193,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
192
193
|
|
|
193
194
|
let mut result = ExtractionResult {
|
|
194
195
|
content: "Short text".to_string(),
|
|
195
|
-
mime_type: "text/plain"
|
|
196
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
196
197
|
metadata: Metadata::default(),
|
|
197
198
|
tables: vec![],
|
|
198
199
|
detected_languages: None,
|
|
@@ -230,7 +231,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
230
231
|
|
|
231
232
|
let result = ExtractionResult {
|
|
232
233
|
content: TEST_TEXT.to_string(),
|
|
233
|
-
mime_type: "text/plain"
|
|
234
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
234
235
|
metadata: Metadata::default(),
|
|
235
236
|
tables: vec![],
|
|
236
237
|
detected_languages: None,
|
|
@@ -257,7 +258,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
257
258
|
|
|
258
259
|
let short_result = ExtractionResult {
|
|
259
260
|
content: "Short text with just a few words".to_string(),
|
|
260
|
-
mime_type: "text/plain"
|
|
261
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
261
262
|
metadata: Metadata::default(),
|
|
262
263
|
tables: vec![],
|
|
263
264
|
detected_languages: None,
|
|
@@ -270,7 +271,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
270
271
|
|
|
271
272
|
let long_result = ExtractionResult {
|
|
272
273
|
content: "word ".repeat(1000),
|
|
273
|
-
mime_type: "text/plain"
|
|
274
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
274
275
|
metadata: Metadata::default(),
|
|
275
276
|
tables: vec![],
|
|
276
277
|
detected_languages: None,
|
|
@@ -87,6 +87,7 @@ mod tests {
|
|
|
87
87
|
use super::*;
|
|
88
88
|
use crate::core::config::LanguageDetectionConfig;
|
|
89
89
|
use crate::types::Metadata;
|
|
90
|
+
use std::borrow::Cow;
|
|
90
91
|
|
|
91
92
|
#[tokio::test]
|
|
92
93
|
async fn test_language_detector_processor() {
|
|
@@ -102,7 +103,7 @@ mod tests {
|
|
|
102
103
|
|
|
103
104
|
let mut result = ExtractionResult {
|
|
104
105
|
content: "Hello world! This is a test of the language detection system.".to_string(),
|
|
105
|
-
mime_type: "text/plain"
|
|
106
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
106
107
|
metadata: Metadata::default(),
|
|
107
108
|
tables: vec![],
|
|
108
109
|
detected_languages: None,
|
|
@@ -128,7 +129,7 @@ mod tests {
|
|
|
128
129
|
|
|
129
130
|
let mut result = ExtractionResult {
|
|
130
131
|
content: "Hello world!".to_string(),
|
|
131
|
-
mime_type: "text/plain"
|
|
132
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
132
133
|
metadata: Metadata::default(),
|
|
133
134
|
tables: vec![],
|
|
134
135
|
detected_languages: None,
|
|
@@ -165,7 +166,7 @@ mod tests {
|
|
|
165
166
|
|
|
166
167
|
let result = ExtractionResult {
|
|
167
168
|
content: "Sample text".to_string(),
|
|
168
|
-
mime_type: "text/plain"
|
|
169
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
169
170
|
metadata: Metadata::default(),
|
|
170
171
|
tables: vec![],
|
|
171
172
|
detected_languages: None,
|
|
@@ -196,7 +197,7 @@ mod tests {
|
|
|
196
197
|
|
|
197
198
|
let short_result = ExtractionResult {
|
|
198
199
|
content: "Short".to_string(),
|
|
199
|
-
mime_type: "text/plain"
|
|
200
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
200
201
|
metadata: Metadata::default(),
|
|
201
202
|
tables: vec![],
|
|
202
203
|
detected_languages: None,
|
|
@@ -209,7 +210,7 @@ mod tests {
|
|
|
209
210
|
|
|
210
211
|
let long_result = ExtractionResult {
|
|
211
212
|
content: "a".repeat(10000),
|
|
212
|
-
mime_type: "text/plain"
|
|
213
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
213
214
|
metadata: Metadata::default(),
|
|
214
215
|
tables: vec![],
|
|
215
216
|
detected_languages: None,
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -88,7 +88,7 @@ pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
|
88
88
|
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
89
89
|
|
|
90
90
|
pub use core::config::{
|
|
91
|
-
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
91
|
+
ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
92
92
|
LanguageDetectionConfig, OcrConfig, OutputFormat, PageConfig, PostProcessorConfig, TokenReductionConfig,
|
|
93
93
|
};
|
|
94
94
|
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::KreuzbergError;
|
|
6
6
|
use rmcp::ErrorData as McpError;
|
|
7
|
+
use std::fmt::Write;
|
|
7
8
|
|
|
8
9
|
/// Map Kreuzberg errors to MCP error responses with appropriate error codes.
|
|
9
10
|
///
|
|
@@ -21,7 +22,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
21
22
|
KreuzbergError::Validation { message, source } => {
|
|
22
23
|
let mut error_message = format!("Validation error: {}", message);
|
|
23
24
|
if let Some(src) = source {
|
|
24
|
-
|
|
25
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
25
26
|
}
|
|
26
27
|
McpError::invalid_params(error_message, None)
|
|
27
28
|
}
|
|
@@ -41,7 +42,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
41
42
|
KreuzbergError::Parsing { message, source } => {
|
|
42
43
|
let mut error_message = format!("Parsing error: {}", message);
|
|
43
44
|
if let Some(src) = source {
|
|
44
|
-
|
|
45
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
45
46
|
}
|
|
46
47
|
McpError::parse_error(error_message, None)
|
|
47
48
|
}
|
|
@@ -52,7 +53,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
52
53
|
KreuzbergError::Ocr { message, source } => {
|
|
53
54
|
let mut error_message = format!("OCR processing error: {}", message);
|
|
54
55
|
if let Some(src) = source {
|
|
55
|
-
|
|
56
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
56
57
|
}
|
|
57
58
|
McpError::internal_error(error_message, None)
|
|
58
59
|
}
|
|
@@ -60,7 +61,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
60
61
|
KreuzbergError::Cache { message, source } => {
|
|
61
62
|
let mut error_message = format!("Cache error: {}", message);
|
|
62
63
|
if let Some(src) = source {
|
|
63
|
-
|
|
64
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
64
65
|
}
|
|
65
66
|
McpError::internal_error(error_message, None)
|
|
66
67
|
}
|
|
@@ -68,7 +69,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
68
69
|
KreuzbergError::ImageProcessing { message, source } => {
|
|
69
70
|
let mut error_message = format!("Image processing error: {}", message);
|
|
70
71
|
if let Some(src) = source {
|
|
71
|
-
|
|
72
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
72
73
|
}
|
|
73
74
|
McpError::internal_error(error_message, None)
|
|
74
75
|
}
|
|
@@ -76,7 +77,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
|
|
|
76
77
|
KreuzbergError::Serialization { message, source } => {
|
|
77
78
|
let mut error_message = format!("Serialization error: {}", message);
|
|
78
79
|
if let Some(src) = source {
|
|
79
|
-
|
|
80
|
+
let _ = write!(error_message, " (caused by: {})", src);
|
|
80
81
|
}
|
|
81
82
|
McpError::internal_error(error_message, None)
|
|
82
83
|
}
|
|
@@ -83,6 +83,7 @@ pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
|
|
|
83
83
|
#[cfg(test)]
|
|
84
84
|
mod tests {
|
|
85
85
|
use super::*;
|
|
86
|
+
use std::borrow::Cow;
|
|
86
87
|
|
|
87
88
|
#[test]
|
|
88
89
|
fn test_build_config_with_no_config() {
|
|
@@ -290,7 +291,7 @@ mod tests {
|
|
|
290
291
|
fn test_format_extraction_result_is_valid_json() {
|
|
291
292
|
let result = KreuzbergResult {
|
|
292
293
|
content: "Sample extracted text".to_string(),
|
|
293
|
-
mime_type: "text/plain"
|
|
294
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
294
295
|
metadata: crate::Metadata::default(),
|
|
295
296
|
tables: vec![],
|
|
296
297
|
detected_languages: None,
|
|
@@ -313,7 +314,7 @@ mod tests {
|
|
|
313
314
|
fn test_format_extraction_result_includes_tables() {
|
|
314
315
|
let result = KreuzbergResult {
|
|
315
316
|
content: "Document with tables".to_string(),
|
|
316
|
-
mime_type: "application/pdf"
|
|
317
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
317
318
|
metadata: crate::Metadata::default(),
|
|
318
319
|
tables: vec![crate::Table {
|
|
319
320
|
cells: vec![
|
|
@@ -342,7 +343,7 @@ mod tests {
|
|
|
342
343
|
fn test_format_extraction_result_includes_chunks_when_present() {
|
|
343
344
|
let result = KreuzbergResult {
|
|
344
345
|
content: "Chunked text".to_string(),
|
|
345
|
-
mime_type: "text/plain"
|
|
346
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
346
347
|
metadata: crate::Metadata::default(),
|
|
347
348
|
tables: vec![],
|
|
348
349
|
detected_languages: None,
|
|
@@ -376,7 +377,7 @@ mod tests {
|
|
|
376
377
|
fn test_format_extraction_result_omits_none_fields() {
|
|
377
378
|
let result = KreuzbergResult {
|
|
378
379
|
content: "Simple text".to_string(),
|
|
379
|
-
mime_type: "text/plain"
|
|
380
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
380
381
|
metadata: crate::Metadata::default(),
|
|
381
382
|
tables: vec![],
|
|
382
383
|
detected_languages: None,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
//! Document extraction MCP tools.
|
|
2
2
|
|
|
3
3
|
use base64::prelude::*;
|
|
4
|
+
use std::borrow::Cow;
|
|
4
5
|
use crate::{
|
|
5
6
|
ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
|
|
6
7
|
extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
|
|
@@ -219,7 +220,7 @@ mod tests {
|
|
|
219
220
|
let server = TestMcpServer::new();
|
|
220
221
|
let params = ExtractFileParams {
|
|
221
222
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
222
|
-
mime_type: Some("application/pdf"
|
|
223
|
+
mime_type: Some(Cow::Borrowed("application/pdf")),
|
|
223
224
|
config: None,
|
|
224
225
|
r#async: true,
|
|
225
226
|
};
|
|
@@ -238,7 +239,7 @@ mod tests {
|
|
|
238
239
|
|
|
239
240
|
let params = ExtractBytesParams {
|
|
240
241
|
data: encoded,
|
|
241
|
-
mime_type: Some("text/plain"
|
|
242
|
+
mime_type: Some(Cow::Borrowed("text/plain")),
|
|
242
243
|
config: None,
|
|
243
244
|
r#async: true,
|
|
244
245
|
};
|
|
@@ -215,12 +215,14 @@ mod tests {
|
|
|
215
215
|
|
|
216
216
|
#[test]
|
|
217
217
|
fn test_hocr_large_document() {
|
|
218
|
+
use std::fmt::Write;
|
|
218
219
|
let mut hocr = String::from(r#"<div class="ocr_page">"#);
|
|
219
220
|
for i in 0..100 {
|
|
220
|
-
|
|
221
|
+
let _ = write!(
|
|
222
|
+
hocr,
|
|
221
223
|
r#"<p class="ocr_par"><span class="ocrx_word">Word{}</span></p>"#,
|
|
222
224
|
i
|
|
223
|
-
)
|
|
225
|
+
);
|
|
224
226
|
}
|
|
225
227
|
hocr.push_str("</div>");
|
|
226
228
|
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
//! text extraction, and result formatting.
|
|
5
5
|
|
|
6
6
|
use super::config::{apply_tesseract_variables, hash_config};
|
|
7
|
-
use super::validation::{
|
|
7
|
+
use super::validation::{
|
|
8
|
+
resolve_all_installed_languages, resolve_tessdata_path, strip_control_characters, validate_language_and_traineddata,
|
|
9
|
+
};
|
|
8
10
|
use crate::core::config::ExtractionConfig;
|
|
9
11
|
use crate::ocr::cache::OcrCache;
|
|
10
12
|
use crate::ocr::error::OcrError;
|
|
@@ -323,8 +325,34 @@ pub(super) fn process_file_with_cache(
|
|
|
323
325
|
process_image_with_cache(&image_bytes, config, cache, output_format)
|
|
324
326
|
}
|
|
325
327
|
|
|
328
|
+
/// Check if a language value is the "all" wildcard (case-insensitive).
|
|
329
|
+
fn is_all_languages(lang: &str) -> bool {
|
|
330
|
+
let lower = lang.to_ascii_lowercase();
|
|
331
|
+
lower == "all" || lower == "*"
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/// Resolve the "all"/"*" wildcard in a config's language field.
|
|
335
|
+
///
|
|
336
|
+
/// If the language is a wildcard, scans the tessdata directory for installed
|
|
337
|
+
/// languages and returns a new config with the resolved language string.
|
|
338
|
+
/// Otherwise returns `None`, indicating the original config should be used as-is.
|
|
339
|
+
fn resolve_config_language(config: &TesseractConfig) -> Result<Option<TesseractConfig>, OcrError> {
|
|
340
|
+
if is_all_languages(&config.language) {
|
|
341
|
+
let tessdata_path = resolve_tessdata_path();
|
|
342
|
+
let resolved = resolve_all_installed_languages(&tessdata_path)?;
|
|
343
|
+
let mut resolved_config = config.clone();
|
|
344
|
+
resolved_config.language = resolved;
|
|
345
|
+
Ok(Some(resolved_config))
|
|
346
|
+
} else {
|
|
347
|
+
Ok(None)
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
|
|
326
351
|
/// Process an image and return OCR results, using cache if enabled.
|
|
327
352
|
///
|
|
353
|
+
/// Resolves the `"all"` / `"*"` language wildcard, then delegates to
|
|
354
|
+
/// [`process_image_resolved`] for caching and OCR execution.
|
|
355
|
+
///
|
|
328
356
|
/// # Arguments
|
|
329
357
|
///
|
|
330
358
|
/// * `image_bytes` - Raw image data
|
|
@@ -343,6 +371,25 @@ pub(super) fn process_image_with_cache(
|
|
|
343
371
|
) -> Result<OcrExtractionResult, OcrError> {
|
|
344
372
|
config.validate().map_err(OcrError::InvalidConfiguration)?;
|
|
345
373
|
|
|
374
|
+
// Resolve "all" / "*" before hashing so cache keys reflect actual languages.
|
|
375
|
+
// If not a wildcard, resolved is None and we use the original config (no clone).
|
|
376
|
+
let resolved = resolve_config_language(config)?;
|
|
377
|
+
let config = resolved.as_ref().unwrap_or(config);
|
|
378
|
+
|
|
379
|
+
process_image_resolved(image_bytes, config, cache, output_format)
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
/// Inner implementation operating on an already-resolved config.
|
|
383
|
+
///
|
|
384
|
+
/// Handles cache lookup, OCR execution, and cache storage. Callers are
|
|
385
|
+
/// responsible for validating and resolving wildcards in the config before
|
|
386
|
+
/// calling this function.
|
|
387
|
+
fn process_image_resolved(
|
|
388
|
+
image_bytes: &[u8],
|
|
389
|
+
config: &TesseractConfig,
|
|
390
|
+
cache: &OcrCache,
|
|
391
|
+
output_format: Option<crate::core::config::OutputFormat>,
|
|
392
|
+
) -> Result<OcrExtractionResult, OcrError> {
|
|
346
393
|
let mut hasher = ahash::AHasher::default();
|
|
347
394
|
use std::hash::{Hash, Hasher};
|
|
348
395
|
image_bytes.hash(&mut hasher);
|
|
@@ -378,7 +425,10 @@ pub(super) fn process_image_with_cache(
|
|
|
378
425
|
|
|
379
426
|
/// Process multiple image files in parallel using Rayon.
|
|
380
427
|
///
|
|
381
|
-
///
|
|
428
|
+
/// Validates and resolves the language wildcard once, then processes all files
|
|
429
|
+
/// in parallel using [`process_image_resolved`] directly (skipping redundant
|
|
430
|
+
/// per-image resolution).
|
|
431
|
+
///
|
|
382
432
|
/// Results are returned in the same order as the input file paths.
|
|
383
433
|
pub(super) fn process_files_batch(
|
|
384
434
|
file_paths: Vec<String>,
|
|
@@ -387,21 +437,64 @@ pub(super) fn process_files_batch(
|
|
|
387
437
|
) -> Vec<BatchItemResult> {
|
|
388
438
|
use rayon::prelude::*;
|
|
389
439
|
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
result: Some(result),
|
|
397
|
-
error: None,
|
|
398
|
-
},
|
|
399
|
-
Err(e) => BatchItemResult {
|
|
400
|
-
file_path: path.clone(),
|
|
440
|
+
// Validate once for the entire batch.
|
|
441
|
+
if let Err(e) = config.validate().map_err(OcrError::InvalidConfiguration) {
|
|
442
|
+
return file_paths
|
|
443
|
+
.into_iter()
|
|
444
|
+
.map(|path| BatchItemResult {
|
|
445
|
+
file_path: path,
|
|
401
446
|
success: false,
|
|
402
447
|
result: None,
|
|
403
448
|
error: Some(e.to_string()),
|
|
404
|
-
}
|
|
449
|
+
})
|
|
450
|
+
.collect();
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// Resolve "all" / "*" once for the entire batch.
|
|
454
|
+
let resolved = match resolve_config_language(config) {
|
|
455
|
+
Ok(r) => r,
|
|
456
|
+
Err(e) => {
|
|
457
|
+
return file_paths
|
|
458
|
+
.into_iter()
|
|
459
|
+
.map(|path| BatchItemResult {
|
|
460
|
+
file_path: path,
|
|
461
|
+
success: false,
|
|
462
|
+
result: None,
|
|
463
|
+
error: Some(e.to_string()),
|
|
464
|
+
})
|
|
465
|
+
.collect();
|
|
466
|
+
}
|
|
467
|
+
};
|
|
468
|
+
let config = resolved.as_ref().unwrap_or(config);
|
|
469
|
+
|
|
470
|
+
file_paths
|
|
471
|
+
.par_iter()
|
|
472
|
+
.map(|path| {
|
|
473
|
+
let image_bytes = match std::fs::read(path) {
|
|
474
|
+
Ok(b) => b,
|
|
475
|
+
Err(e) => {
|
|
476
|
+
return BatchItemResult {
|
|
477
|
+
file_path: path.clone(),
|
|
478
|
+
success: false,
|
|
479
|
+
result: None,
|
|
480
|
+
error: Some(OcrError::IOError(format!("Failed to read file '{}': {}", path, e)).to_string()),
|
|
481
|
+
};
|
|
482
|
+
}
|
|
483
|
+
};
|
|
484
|
+
match process_image_resolved(&image_bytes, config, cache, None) {
|
|
485
|
+
Ok(result) => BatchItemResult {
|
|
486
|
+
file_path: path.clone(),
|
|
487
|
+
success: true,
|
|
488
|
+
result: Some(result),
|
|
489
|
+
error: None,
|
|
490
|
+
},
|
|
491
|
+
Err(e) => BatchItemResult {
|
|
492
|
+
file_path: path.clone(),
|
|
493
|
+
success: false,
|
|
494
|
+
result: None,
|
|
495
|
+
error: Some(e.to_string()),
|
|
496
|
+
},
|
|
497
|
+
}
|
|
405
498
|
})
|
|
406
499
|
.collect()
|
|
407
500
|
}
|
|
@@ -411,6 +504,27 @@ mod tests {
|
|
|
411
504
|
use super::*;
|
|
412
505
|
use tempfile::tempdir;
|
|
413
506
|
|
|
507
|
+
#[test]
|
|
508
|
+
fn test_is_all_languages() {
|
|
509
|
+
assert!(is_all_languages("all"));
|
|
510
|
+
assert!(is_all_languages("ALL"));
|
|
511
|
+
assert!(is_all_languages("All"));
|
|
512
|
+
assert!(is_all_languages("*"));
|
|
513
|
+
assert!(!is_all_languages("eng"));
|
|
514
|
+
assert!(!is_all_languages("eng+fra"));
|
|
515
|
+
assert!(!is_all_languages(""));
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
#[test]
|
|
519
|
+
fn test_resolve_config_language_passthrough() {
|
|
520
|
+
let config = TesseractConfig {
|
|
521
|
+
language: "eng".to_string(),
|
|
522
|
+
..TesseractConfig::default()
|
|
523
|
+
};
|
|
524
|
+
let resolved = resolve_config_language(&config).unwrap();
|
|
525
|
+
assert!(resolved.is_none(), "non-wildcard should return None (no clone)");
|
|
526
|
+
}
|
|
527
|
+
|
|
414
528
|
#[test]
|
|
415
529
|
fn test_compute_image_hash_deterministic() {
|
|
416
530
|
use ahash::AHasher;
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
//! before OCR processing begins.
|
|
5
5
|
|
|
6
6
|
use crate::ocr::error::OcrError;
|
|
7
|
+
use crate::ocr::validation::TESSERACT_SUPPORTED_LANGUAGE_CODES;
|
|
7
8
|
use std::env;
|
|
8
9
|
use std::path::Path;
|
|
9
10
|
|
|
@@ -83,6 +84,71 @@ pub(super) fn resolve_tessdata_path() -> String {
|
|
|
83
84
|
.unwrap_or_default()
|
|
84
85
|
}
|
|
85
86
|
|
|
87
|
+
/// Resolve all installed Tesseract languages from the tessdata directory.
|
|
88
|
+
///
|
|
89
|
+
/// Scans the tessdata directory for `*.traineddata` files, filters against
|
|
90
|
+
/// known Tesseract language codes (excluding non-language files like `osd`),
|
|
91
|
+
/// and returns a `+`-separated language string (e.g., `"eng+fra+deu"`).
|
|
92
|
+
///
|
|
93
|
+
/// # Arguments
|
|
94
|
+
///
|
|
95
|
+
/// * `tessdata_path` - Path to the tessdata directory
|
|
96
|
+
///
|
|
97
|
+
/// # Returns
|
|
98
|
+
///
|
|
99
|
+
/// A `+`-separated string of installed language codes, or an error if no languages are found.
|
|
100
|
+
pub(super) fn resolve_all_installed_languages(tessdata_path: &str) -> Result<String, OcrError> {
|
|
101
|
+
if tessdata_path.is_empty() {
|
|
102
|
+
return Err(OcrError::TesseractInitializationFailed(
|
|
103
|
+
"Cannot resolve installed languages: tessdata path is empty. \
|
|
104
|
+
Set TESSDATA_PREFIX or install Tesseract with language data."
|
|
105
|
+
.to_string(),
|
|
106
|
+
));
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
let tessdata_dir = Path::new(tessdata_path);
|
|
110
|
+
if !tessdata_dir.exists() {
|
|
111
|
+
return Err(OcrError::TesseractInitializationFailed(format!(
|
|
112
|
+
"Tessdata directory does not exist: {}",
|
|
113
|
+
tessdata_path
|
|
114
|
+
)));
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
let entries = std::fs::read_dir(tessdata_dir).map_err(|e| {
|
|
118
|
+
OcrError::TesseractInitializationFailed(format!("Failed to read tessdata directory '{}': {}", tessdata_path, e))
|
|
119
|
+
})?;
|
|
120
|
+
|
|
121
|
+
// Non-language traineddata files to exclude (special-purpose data, not OCR languages)
|
|
122
|
+
const EXCLUDED: &[&str] = &["osd", "equ"];
|
|
123
|
+
|
|
124
|
+
let mut languages: Vec<String> = entries
|
|
125
|
+
.filter_map(|entry| entry.ok())
|
|
126
|
+
.filter_map(|entry| {
|
|
127
|
+
let path = entry.path();
|
|
128
|
+
let file_name = path.file_name()?.to_str()?;
|
|
129
|
+
let lang = file_name.strip_suffix(".traineddata")?;
|
|
130
|
+
if EXCLUDED.contains(&lang) {
|
|
131
|
+
return None;
|
|
132
|
+
}
|
|
133
|
+
if TESSERACT_SUPPORTED_LANGUAGE_CODES.contains(lang) {
|
|
134
|
+
Some(lang.to_string())
|
|
135
|
+
} else {
|
|
136
|
+
None
|
|
137
|
+
}
|
|
138
|
+
})
|
|
139
|
+
.collect();
|
|
140
|
+
|
|
141
|
+
if languages.is_empty() {
|
|
142
|
+
return Err(OcrError::TesseractInitializationFailed(format!(
|
|
143
|
+
"No installed Tesseract languages found in '{}'",
|
|
144
|
+
tessdata_path
|
|
145
|
+
)));
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
languages.sort();
|
|
149
|
+
Ok(languages.join("+"))
|
|
150
|
+
}
|
|
151
|
+
|
|
86
152
|
/// Strip control characters from text, preserving whitespace.
|
|
87
153
|
///
|
|
88
154
|
/// Removes control characters (0x00-0x1F, 0x7F) except for newlines, carriage returns, and tabs.
|
|
@@ -111,6 +177,69 @@ pub(super) fn strip_control_characters(text: &str) -> String {
|
|
|
111
177
|
mod tests {
|
|
112
178
|
use super::*;
|
|
113
179
|
|
|
180
|
+
#[test]
|
|
181
|
+
fn test_resolve_all_installed_languages_success() {
|
|
182
|
+
let dir = tempfile::tempdir().unwrap();
|
|
183
|
+
let tessdata = dir.path();
|
|
184
|
+
|
|
185
|
+
// Create mock traineddata files
|
|
186
|
+
std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
|
|
187
|
+
std::fs::write(tessdata.join("fra.traineddata"), b"").unwrap();
|
|
188
|
+
std::fs::write(tessdata.join("deu.traineddata"), b"").unwrap();
|
|
189
|
+
|
|
190
|
+
let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
|
|
191
|
+
assert_eq!(result, "deu+eng+fra");
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
#[test]
|
|
195
|
+
fn test_resolve_all_installed_languages_excludes_osd() {
|
|
196
|
+
let dir = tempfile::tempdir().unwrap();
|
|
197
|
+
let tessdata = dir.path();
|
|
198
|
+
|
|
199
|
+
std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
|
|
200
|
+
std::fs::write(tessdata.join("osd.traineddata"), b"").unwrap();
|
|
201
|
+
|
|
202
|
+
let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
|
|
203
|
+
assert_eq!(result, "eng");
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
#[test]
|
|
207
|
+
fn test_resolve_all_installed_languages_excludes_equ() {
|
|
208
|
+
let dir = tempfile::tempdir().unwrap();
|
|
209
|
+
let tessdata = dir.path();
|
|
210
|
+
|
|
211
|
+
std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
|
|
212
|
+
std::fs::write(tessdata.join("equ.traineddata"), b"").unwrap();
|
|
213
|
+
|
|
214
|
+
let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
|
|
215
|
+
assert_eq!(result, "eng");
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
#[test]
|
|
219
|
+
fn test_resolve_all_installed_languages_excludes_unknown() {
|
|
220
|
+
let dir = tempfile::tempdir().unwrap();
|
|
221
|
+
let tessdata = dir.path();
|
|
222
|
+
|
|
223
|
+
std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
|
|
224
|
+
std::fs::write(tessdata.join("notareal.traineddata"), b"").unwrap();
|
|
225
|
+
|
|
226
|
+
let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
|
|
227
|
+
assert_eq!(result, "eng");
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
#[test]
|
|
231
|
+
fn test_resolve_all_installed_languages_empty_dir() {
|
|
232
|
+
let dir = tempfile::tempdir().unwrap();
|
|
233
|
+
let result = resolve_all_installed_languages(dir.path().to_str().unwrap());
|
|
234
|
+
assert!(result.is_err());
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#[test]
|
|
238
|
+
fn test_resolve_all_installed_languages_empty_path() {
|
|
239
|
+
let result = resolve_all_installed_languages("");
|
|
240
|
+
assert!(result.is_err());
|
|
241
|
+
}
|
|
242
|
+
|
|
114
243
|
#[test]
|
|
115
244
|
fn test_strip_control_characters() {
|
|
116
245
|
let input = "Hello\x00World\x01Test";
|