kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -6,6 +6,7 @@
6
6
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
7
7
  use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
8
8
  use async_trait::async_trait;
9
+ use std::borrow::Cow;
9
10
 
10
11
  /// Post-processor that extracts keywords from document content.
11
12
  ///
@@ -65,7 +66,7 @@ impl PostProcessor for KeywordExtractor {
65
66
  result
66
67
  .metadata
67
68
  .additional
68
- .insert("keywords".to_string(), serde_json::to_value(&keywords)?);
69
+ .insert(Cow::Borrowed("keywords"), serde_json::to_value(&keywords)?);
69
70
 
70
71
  Ok(())
71
72
  }
@@ -107,7 +108,7 @@ machine learning that uses neural networks with multiple layers.
107
108
 
108
109
  let mut result = ExtractionResult {
109
110
  content: TEST_TEXT.to_string(),
110
- mime_type: "text/plain".to_string(),
111
+ mime_type: Cow::Borrowed("text/plain"),
111
112
  metadata: Metadata::default(),
112
113
  tables: vec![],
113
114
  detected_languages: None,
@@ -138,7 +139,7 @@ machine learning that uses neural networks with multiple layers.
138
139
 
139
140
  let mut result = ExtractionResult {
140
141
  content: TEST_TEXT.to_string(),
141
- mime_type: "text/plain".to_string(),
142
+ mime_type: Cow::Borrowed("text/plain"),
142
143
  metadata: Metadata::default(),
143
144
  tables: vec![],
144
145
  detected_languages: None,
@@ -165,7 +166,7 @@ machine learning that uses neural networks with multiple layers.
165
166
 
166
167
  let mut result = ExtractionResult {
167
168
  content: TEST_TEXT.to_string(),
168
- mime_type: "text/plain".to_string(),
169
+ mime_type: Cow::Borrowed("text/plain"),
169
170
  metadata: Metadata::default(),
170
171
  tables: vec![],
171
172
  detected_languages: None,
@@ -192,7 +193,7 @@ machine learning that uses neural networks with multiple layers.
192
193
 
193
194
  let mut result = ExtractionResult {
194
195
  content: "Short text".to_string(),
195
- mime_type: "text/plain".to_string(),
196
+ mime_type: Cow::Borrowed("text/plain"),
196
197
  metadata: Metadata::default(),
197
198
  tables: vec![],
198
199
  detected_languages: None,
@@ -230,7 +231,7 @@ machine learning that uses neural networks with multiple layers.
230
231
 
231
232
  let result = ExtractionResult {
232
233
  content: TEST_TEXT.to_string(),
233
- mime_type: "text/plain".to_string(),
234
+ mime_type: Cow::Borrowed("text/plain"),
234
235
  metadata: Metadata::default(),
235
236
  tables: vec![],
236
237
  detected_languages: None,
@@ -257,7 +258,7 @@ machine learning that uses neural networks with multiple layers.
257
258
 
258
259
  let short_result = ExtractionResult {
259
260
  content: "Short text with just a few words".to_string(),
260
- mime_type: "text/plain".to_string(),
261
+ mime_type: Cow::Borrowed("text/plain"),
261
262
  metadata: Metadata::default(),
262
263
  tables: vec![],
263
264
  detected_languages: None,
@@ -270,7 +271,7 @@ machine learning that uses neural networks with multiple layers.
270
271
 
271
272
  let long_result = ExtractionResult {
272
273
  content: "word ".repeat(1000),
273
- mime_type: "text/plain".to_string(),
274
+ mime_type: Cow::Borrowed("text/plain"),
274
275
  metadata: Metadata::default(),
275
276
  tables: vec![],
276
277
  detected_languages: None,
@@ -87,6 +87,7 @@ mod tests {
87
87
  use super::*;
88
88
  use crate::core::config::LanguageDetectionConfig;
89
89
  use crate::types::Metadata;
90
+ use std::borrow::Cow;
90
91
 
91
92
  #[tokio::test]
92
93
  async fn test_language_detector_processor() {
@@ -102,7 +103,7 @@ mod tests {
102
103
 
103
104
  let mut result = ExtractionResult {
104
105
  content: "Hello world! This is a test of the language detection system.".to_string(),
105
- mime_type: "text/plain".to_string(),
106
+ mime_type: Cow::Borrowed("text/plain"),
106
107
  metadata: Metadata::default(),
107
108
  tables: vec![],
108
109
  detected_languages: None,
@@ -128,7 +129,7 @@ mod tests {
128
129
 
129
130
  let mut result = ExtractionResult {
130
131
  content: "Hello world!".to_string(),
131
- mime_type: "text/plain".to_string(),
132
+ mime_type: Cow::Borrowed("text/plain"),
132
133
  metadata: Metadata::default(),
133
134
  tables: vec![],
134
135
  detected_languages: None,
@@ -165,7 +166,7 @@ mod tests {
165
166
 
166
167
  let result = ExtractionResult {
167
168
  content: "Sample text".to_string(),
168
- mime_type: "text/plain".to_string(),
169
+ mime_type: Cow::Borrowed("text/plain"),
169
170
  metadata: Metadata::default(),
170
171
  tables: vec![],
171
172
  detected_languages: None,
@@ -196,7 +197,7 @@ mod tests {
196
197
 
197
198
  let short_result = ExtractionResult {
198
199
  content: "Short".to_string(),
199
- mime_type: "text/plain".to_string(),
200
+ mime_type: Cow::Borrowed("text/plain"),
200
201
  metadata: Metadata::default(),
201
202
  tables: vec![],
202
203
  detected_languages: None,
@@ -209,7 +210,7 @@ mod tests {
209
210
 
210
211
  let long_result = ExtractionResult {
211
212
  content: "a".repeat(10000),
212
- mime_type: "text/plain".to_string(),
213
+ mime_type: Cow::Borrowed("text/plain"),
213
214
  metadata: Metadata::default(),
214
215
  tables: vec![],
215
216
  detected_languages: None,
@@ -88,7 +88,7 @@ pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
88
88
  pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
89
89
 
90
90
  pub use core::config::{
91
- ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
91
+ ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
92
92
  LanguageDetectionConfig, OcrConfig, OutputFormat, PageConfig, PostProcessorConfig, TokenReductionConfig,
93
93
  };
94
94
 
@@ -4,6 +4,7 @@
4
4
 
5
5
  use crate::KreuzbergError;
6
6
  use rmcp::ErrorData as McpError;
7
+ use std::fmt::Write;
7
8
 
8
9
  /// Map Kreuzberg errors to MCP error responses with appropriate error codes.
9
10
  ///
@@ -21,7 +22,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
21
22
  KreuzbergError::Validation { message, source } => {
22
23
  let mut error_message = format!("Validation error: {}", message);
23
24
  if let Some(src) = source {
24
- error_message.push_str(&format!(" (caused by: {})", src));
25
+ let _ = write!(error_message, " (caused by: {})", src);
25
26
  }
26
27
  McpError::invalid_params(error_message, None)
27
28
  }
@@ -41,7 +42,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
41
42
  KreuzbergError::Parsing { message, source } => {
42
43
  let mut error_message = format!("Parsing error: {}", message);
43
44
  if let Some(src) = source {
44
- error_message.push_str(&format!(" (caused by: {})", src));
45
+ let _ = write!(error_message, " (caused by: {})", src);
45
46
  }
46
47
  McpError::parse_error(error_message, None)
47
48
  }
@@ -52,7 +53,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
52
53
  KreuzbergError::Ocr { message, source } => {
53
54
  let mut error_message = format!("OCR processing error: {}", message);
54
55
  if let Some(src) = source {
55
- error_message.push_str(&format!(" (caused by: {})", src));
56
+ let _ = write!(error_message, " (caused by: {})", src);
56
57
  }
57
58
  McpError::internal_error(error_message, None)
58
59
  }
@@ -60,7 +61,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
60
61
  KreuzbergError::Cache { message, source } => {
61
62
  let mut error_message = format!("Cache error: {}", message);
62
63
  if let Some(src) = source {
63
- error_message.push_str(&format!(" (caused by: {})", src));
64
+ let _ = write!(error_message, " (caused by: {})", src);
64
65
  }
65
66
  McpError::internal_error(error_message, None)
66
67
  }
@@ -68,7 +69,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
68
69
  KreuzbergError::ImageProcessing { message, source } => {
69
70
  let mut error_message = format!("Image processing error: {}", message);
70
71
  if let Some(src) = source {
71
- error_message.push_str(&format!(" (caused by: {})", src));
72
+ let _ = write!(error_message, " (caused by: {})", src);
72
73
  }
73
74
  McpError::internal_error(error_message, None)
74
75
  }
@@ -76,7 +77,7 @@ pub fn map_kreuzberg_error_to_mcp(error: KreuzbergError) -> McpError {
76
77
  KreuzbergError::Serialization { message, source } => {
77
78
  let mut error_message = format!("Serialization error: {}", message);
78
79
  if let Some(src) = source {
79
- error_message.push_str(&format!(" (caused by: {})", src));
80
+ let _ = write!(error_message, " (caused by: {})", src);
80
81
  }
81
82
  McpError::internal_error(error_message, None)
82
83
  }
@@ -83,6 +83,7 @@ pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
83
83
  #[cfg(test)]
84
84
  mod tests {
85
85
  use super::*;
86
+ use std::borrow::Cow;
86
87
 
87
88
  #[test]
88
89
  fn test_build_config_with_no_config() {
@@ -290,7 +291,7 @@ mod tests {
290
291
  fn test_format_extraction_result_is_valid_json() {
291
292
  let result = KreuzbergResult {
292
293
  content: "Sample extracted text".to_string(),
293
- mime_type: "text/plain".to_string(),
294
+ mime_type: Cow::Borrowed("text/plain"),
294
295
  metadata: crate::Metadata::default(),
295
296
  tables: vec![],
296
297
  detected_languages: None,
@@ -313,7 +314,7 @@ mod tests {
313
314
  fn test_format_extraction_result_includes_tables() {
314
315
  let result = KreuzbergResult {
315
316
  content: "Document with tables".to_string(),
316
- mime_type: "application/pdf".to_string(),
317
+ mime_type: Cow::Borrowed("application/pdf"),
317
318
  metadata: crate::Metadata::default(),
318
319
  tables: vec![crate::Table {
319
320
  cells: vec![
@@ -342,7 +343,7 @@ mod tests {
342
343
  fn test_format_extraction_result_includes_chunks_when_present() {
343
344
  let result = KreuzbergResult {
344
345
  content: "Chunked text".to_string(),
345
- mime_type: "text/plain".to_string(),
346
+ mime_type: Cow::Borrowed("text/plain"),
346
347
  metadata: crate::Metadata::default(),
347
348
  tables: vec![],
348
349
  detected_languages: None,
@@ -376,7 +377,7 @@ mod tests {
376
377
  fn test_format_extraction_result_omits_none_fields() {
377
378
  let result = KreuzbergResult {
378
379
  content: "Simple text".to_string(),
379
- mime_type: "text/plain".to_string(),
380
+ mime_type: Cow::Borrowed("text/plain"),
380
381
  metadata: crate::Metadata::default(),
381
382
  tables: vec![],
382
383
  detected_languages: None,
@@ -1,6 +1,7 @@
1
1
  //! Document extraction MCP tools.
2
2
 
3
3
  use base64::prelude::*;
4
+ use std::borrow::Cow;
4
5
  use crate::{
5
6
  ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
6
7
  extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
@@ -219,7 +220,7 @@ mod tests {
219
220
  let server = TestMcpServer::new();
220
221
  let params = ExtractFileParams {
221
222
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
222
- mime_type: Some("application/pdf".to_string()),
223
+ mime_type: Some(Cow::Borrowed("application/pdf")),
223
224
  config: None,
224
225
  r#async: true,
225
226
  };
@@ -238,7 +239,7 @@ mod tests {
238
239
 
239
240
  let params = ExtractBytesParams {
240
241
  data: encoded,
241
- mime_type: Some("text/plain".to_string()),
242
+ mime_type: Some(Cow::Borrowed("text/plain")),
242
243
  config: None,
243
244
  r#async: true,
244
245
  };
@@ -215,12 +215,14 @@ mod tests {
215
215
 
216
216
  #[test]
217
217
  fn test_hocr_large_document() {
218
+ use std::fmt::Write;
218
219
  let mut hocr = String::from(r#"<div class="ocr_page">"#);
219
220
  for i in 0..100 {
220
- hocr.push_str(&format!(
221
+ let _ = write!(
222
+ hocr,
221
223
  r#"<p class="ocr_par"><span class="ocrx_word">Word{}</span></p>"#,
222
224
  i
223
- ));
225
+ );
224
226
  }
225
227
  hocr.push_str("</div>");
226
228
 
@@ -4,7 +4,9 @@
4
4
  //! text extraction, and result formatting.
5
5
 
6
6
  use super::config::{apply_tesseract_variables, hash_config};
7
- use super::validation::{resolve_tessdata_path, strip_control_characters, validate_language_and_traineddata};
7
+ use super::validation::{
8
+ resolve_all_installed_languages, resolve_tessdata_path, strip_control_characters, validate_language_and_traineddata,
9
+ };
8
10
  use crate::core::config::ExtractionConfig;
9
11
  use crate::ocr::cache::OcrCache;
10
12
  use crate::ocr::error::OcrError;
@@ -323,8 +325,34 @@ pub(super) fn process_file_with_cache(
323
325
  process_image_with_cache(&image_bytes, config, cache, output_format)
324
326
  }
325
327
 
328
+ /// Check if a language value is the "all" wildcard (case-insensitive).
329
+ fn is_all_languages(lang: &str) -> bool {
330
+ let lower = lang.to_ascii_lowercase();
331
+ lower == "all" || lower == "*"
332
+ }
333
+
334
+ /// Resolve the "all"/"*" wildcard in a config's language field.
335
+ ///
336
+ /// If the language is a wildcard, scans the tessdata directory for installed
337
+ /// languages and returns a new config with the resolved language string.
338
+ /// Otherwise returns `None`, indicating the original config should be used as-is.
339
+ fn resolve_config_language(config: &TesseractConfig) -> Result<Option<TesseractConfig>, OcrError> {
340
+ if is_all_languages(&config.language) {
341
+ let tessdata_path = resolve_tessdata_path();
342
+ let resolved = resolve_all_installed_languages(&tessdata_path)?;
343
+ let mut resolved_config = config.clone();
344
+ resolved_config.language = resolved;
345
+ Ok(Some(resolved_config))
346
+ } else {
347
+ Ok(None)
348
+ }
349
+ }
350
+
326
351
  /// Process an image and return OCR results, using cache if enabled.
327
352
  ///
353
+ /// Resolves the `"all"` / `"*"` language wildcard, then delegates to
354
+ /// [`process_image_resolved`] for caching and OCR execution.
355
+ ///
328
356
  /// # Arguments
329
357
  ///
330
358
  /// * `image_bytes` - Raw image data
@@ -343,6 +371,25 @@ pub(super) fn process_image_with_cache(
343
371
  ) -> Result<OcrExtractionResult, OcrError> {
344
372
  config.validate().map_err(OcrError::InvalidConfiguration)?;
345
373
 
374
+ // Resolve "all" / "*" before hashing so cache keys reflect actual languages.
375
+ // If not a wildcard, resolved is None and we use the original config (no clone).
376
+ let resolved = resolve_config_language(config)?;
377
+ let config = resolved.as_ref().unwrap_or(config);
378
+
379
+ process_image_resolved(image_bytes, config, cache, output_format)
380
+ }
381
+
382
+ /// Inner implementation operating on an already-resolved config.
383
+ ///
384
+ /// Handles cache lookup, OCR execution, and cache storage. Callers are
385
+ /// responsible for validating and resolving wildcards in the config before
386
+ /// calling this function.
387
+ fn process_image_resolved(
388
+ image_bytes: &[u8],
389
+ config: &TesseractConfig,
390
+ cache: &OcrCache,
391
+ output_format: Option<crate::core::config::OutputFormat>,
392
+ ) -> Result<OcrExtractionResult, OcrError> {
346
393
  let mut hasher = ahash::AHasher::default();
347
394
  use std::hash::{Hash, Hasher};
348
395
  image_bytes.hash(&mut hasher);
@@ -378,7 +425,10 @@ pub(super) fn process_image_with_cache(
378
425
 
379
426
  /// Process multiple image files in parallel using Rayon.
380
427
  ///
381
- /// This method processes OCR operations in parallel across CPU cores for improved throughput.
428
+ /// Validates and resolves the language wildcard once, then processes all files
429
+ /// in parallel using [`process_image_resolved`] directly (skipping redundant
430
+ /// per-image resolution).
431
+ ///
382
432
  /// Results are returned in the same order as the input file paths.
383
433
  pub(super) fn process_files_batch(
384
434
  file_paths: Vec<String>,
@@ -387,21 +437,64 @@ pub(super) fn process_files_batch(
387
437
  ) -> Vec<BatchItemResult> {
388
438
  use rayon::prelude::*;
389
439
 
390
- file_paths
391
- .par_iter()
392
- .map(|path| match process_file_with_cache(path, config, cache, None) {
393
- Ok(result) => BatchItemResult {
394
- file_path: path.clone(),
395
- success: true,
396
- result: Some(result),
397
- error: None,
398
- },
399
- Err(e) => BatchItemResult {
400
- file_path: path.clone(),
440
+ // Validate once for the entire batch.
441
+ if let Err(e) = config.validate().map_err(OcrError::InvalidConfiguration) {
442
+ return file_paths
443
+ .into_iter()
444
+ .map(|path| BatchItemResult {
445
+ file_path: path,
401
446
  success: false,
402
447
  result: None,
403
448
  error: Some(e.to_string()),
404
- },
449
+ })
450
+ .collect();
451
+ }
452
+
453
+ // Resolve "all" / "*" once for the entire batch.
454
+ let resolved = match resolve_config_language(config) {
455
+ Ok(r) => r,
456
+ Err(e) => {
457
+ return file_paths
458
+ .into_iter()
459
+ .map(|path| BatchItemResult {
460
+ file_path: path,
461
+ success: false,
462
+ result: None,
463
+ error: Some(e.to_string()),
464
+ })
465
+ .collect();
466
+ }
467
+ };
468
+ let config = resolved.as_ref().unwrap_or(config);
469
+
470
+ file_paths
471
+ .par_iter()
472
+ .map(|path| {
473
+ let image_bytes = match std::fs::read(path) {
474
+ Ok(b) => b,
475
+ Err(e) => {
476
+ return BatchItemResult {
477
+ file_path: path.clone(),
478
+ success: false,
479
+ result: None,
480
+ error: Some(OcrError::IOError(format!("Failed to read file '{}': {}", path, e)).to_string()),
481
+ };
482
+ }
483
+ };
484
+ match process_image_resolved(&image_bytes, config, cache, None) {
485
+ Ok(result) => BatchItemResult {
486
+ file_path: path.clone(),
487
+ success: true,
488
+ result: Some(result),
489
+ error: None,
490
+ },
491
+ Err(e) => BatchItemResult {
492
+ file_path: path.clone(),
493
+ success: false,
494
+ result: None,
495
+ error: Some(e.to_string()),
496
+ },
497
+ }
405
498
  })
406
499
  .collect()
407
500
  }
@@ -411,6 +504,27 @@ mod tests {
411
504
  use super::*;
412
505
  use tempfile::tempdir;
413
506
 
507
+ #[test]
508
+ fn test_is_all_languages() {
509
+ assert!(is_all_languages("all"));
510
+ assert!(is_all_languages("ALL"));
511
+ assert!(is_all_languages("All"));
512
+ assert!(is_all_languages("*"));
513
+ assert!(!is_all_languages("eng"));
514
+ assert!(!is_all_languages("eng+fra"));
515
+ assert!(!is_all_languages(""));
516
+ }
517
+
518
+ #[test]
519
+ fn test_resolve_config_language_passthrough() {
520
+ let config = TesseractConfig {
521
+ language: "eng".to_string(),
522
+ ..TesseractConfig::default()
523
+ };
524
+ let resolved = resolve_config_language(&config).unwrap();
525
+ assert!(resolved.is_none(), "non-wildcard should return None (no clone)");
526
+ }
527
+
414
528
  #[test]
415
529
  fn test_compute_image_hash_deterministic() {
416
530
  use ahash::AHasher;
@@ -4,6 +4,7 @@
4
4
  //! before OCR processing begins.
5
5
 
6
6
  use crate::ocr::error::OcrError;
7
+ use crate::ocr::validation::TESSERACT_SUPPORTED_LANGUAGE_CODES;
7
8
  use std::env;
8
9
  use std::path::Path;
9
10
 
@@ -83,6 +84,71 @@ pub(super) fn resolve_tessdata_path() -> String {
83
84
  .unwrap_or_default()
84
85
  }
85
86
 
87
+ /// Resolve all installed Tesseract languages from the tessdata directory.
88
+ ///
89
+ /// Scans the tessdata directory for `*.traineddata` files, filters against
90
+ /// known Tesseract language codes (excluding non-language files like `osd`),
91
+ /// and returns a `+`-separated language string (e.g., `"eng+fra+deu"`).
92
+ ///
93
+ /// # Arguments
94
+ ///
95
+ /// * `tessdata_path` - Path to the tessdata directory
96
+ ///
97
+ /// # Returns
98
+ ///
99
+ /// A `+`-separated string of installed language codes, or an error if no languages are found.
100
+ pub(super) fn resolve_all_installed_languages(tessdata_path: &str) -> Result<String, OcrError> {
101
+ if tessdata_path.is_empty() {
102
+ return Err(OcrError::TesseractInitializationFailed(
103
+ "Cannot resolve installed languages: tessdata path is empty. \
104
+ Set TESSDATA_PREFIX or install Tesseract with language data."
105
+ .to_string(),
106
+ ));
107
+ }
108
+
109
+ let tessdata_dir = Path::new(tessdata_path);
110
+ if !tessdata_dir.exists() {
111
+ return Err(OcrError::TesseractInitializationFailed(format!(
112
+ "Tessdata directory does not exist: {}",
113
+ tessdata_path
114
+ )));
115
+ }
116
+
117
+ let entries = std::fs::read_dir(tessdata_dir).map_err(|e| {
118
+ OcrError::TesseractInitializationFailed(format!("Failed to read tessdata directory '{}': {}", tessdata_path, e))
119
+ })?;
120
+
121
+ // Non-language traineddata files to exclude (special-purpose data, not OCR languages)
122
+ const EXCLUDED: &[&str] = &["osd", "equ"];
123
+
124
+ let mut languages: Vec<String> = entries
125
+ .filter_map(|entry| entry.ok())
126
+ .filter_map(|entry| {
127
+ let path = entry.path();
128
+ let file_name = path.file_name()?.to_str()?;
129
+ let lang = file_name.strip_suffix(".traineddata")?;
130
+ if EXCLUDED.contains(&lang) {
131
+ return None;
132
+ }
133
+ if TESSERACT_SUPPORTED_LANGUAGE_CODES.contains(lang) {
134
+ Some(lang.to_string())
135
+ } else {
136
+ None
137
+ }
138
+ })
139
+ .collect();
140
+
141
+ if languages.is_empty() {
142
+ return Err(OcrError::TesseractInitializationFailed(format!(
143
+ "No installed Tesseract languages found in '{}'",
144
+ tessdata_path
145
+ )));
146
+ }
147
+
148
+ languages.sort();
149
+ Ok(languages.join("+"))
150
+ }
151
+
86
152
  /// Strip control characters from text, preserving whitespace.
87
153
  ///
88
154
  /// Removes control characters (0x00-0x1F, 0x7F) except for newlines, carriage returns, and tabs.
@@ -111,6 +177,69 @@ pub(super) fn strip_control_characters(text: &str) -> String {
111
177
  mod tests {
112
178
  use super::*;
113
179
 
180
+ #[test]
181
+ fn test_resolve_all_installed_languages_success() {
182
+ let dir = tempfile::tempdir().unwrap();
183
+ let tessdata = dir.path();
184
+
185
+ // Create mock traineddata files
186
+ std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
187
+ std::fs::write(tessdata.join("fra.traineddata"), b"").unwrap();
188
+ std::fs::write(tessdata.join("deu.traineddata"), b"").unwrap();
189
+
190
+ let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
191
+ assert_eq!(result, "deu+eng+fra");
192
+ }
193
+
194
+ #[test]
195
+ fn test_resolve_all_installed_languages_excludes_osd() {
196
+ let dir = tempfile::tempdir().unwrap();
197
+ let tessdata = dir.path();
198
+
199
+ std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
200
+ std::fs::write(tessdata.join("osd.traineddata"), b"").unwrap();
201
+
202
+ let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
203
+ assert_eq!(result, "eng");
204
+ }
205
+
206
+ #[test]
207
+ fn test_resolve_all_installed_languages_excludes_equ() {
208
+ let dir = tempfile::tempdir().unwrap();
209
+ let tessdata = dir.path();
210
+
211
+ std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
212
+ std::fs::write(tessdata.join("equ.traineddata"), b"").unwrap();
213
+
214
+ let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
215
+ assert_eq!(result, "eng");
216
+ }
217
+
218
+ #[test]
219
+ fn test_resolve_all_installed_languages_excludes_unknown() {
220
+ let dir = tempfile::tempdir().unwrap();
221
+ let tessdata = dir.path();
222
+
223
+ std::fs::write(tessdata.join("eng.traineddata"), b"").unwrap();
224
+ std::fs::write(tessdata.join("notareal.traineddata"), b"").unwrap();
225
+
226
+ let result = resolve_all_installed_languages(tessdata.to_str().unwrap()).unwrap();
227
+ assert_eq!(result, "eng");
228
+ }
229
+
230
+ #[test]
231
+ fn test_resolve_all_installed_languages_empty_dir() {
232
+ let dir = tempfile::tempdir().unwrap();
233
+ let result = resolve_all_installed_languages(dir.path().to_str().unwrap());
234
+ assert!(result.is_err());
235
+ }
236
+
237
+ #[test]
238
+ fn test_resolve_all_installed_languages_empty_path() {
239
+ let result = resolve_all_installed_languages("");
240
+ assert!(result.is_err());
241
+ }
242
+
114
243
  #[test]
115
244
  fn test_strip_control_characters() {
116
245
  let input = "Hello\x00World\x01Test";