kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -8,7 +8,9 @@ use crate::core::config::OcrConfig;
8
8
  use crate::ocr::processor::OcrProcessor;
9
9
  use crate::plugins::{OcrBackend, OcrBackendType, Plugin};
10
10
  use crate::types::ExtractionResult;
11
+ use ahash::AHashMap;
11
12
  use async_trait::async_trait;
13
+ use std::borrow::Cow;
12
14
  use std::path::Path;
13
15
  use std::sync::{Arc, OnceLock};
14
16
 
@@ -196,9 +198,23 @@ impl OcrBackend for TesseractBackend {
196
198
  source: Some(Box::new(e)),
197
199
  })?;
198
200
 
201
+ // Use resolved language from OCR result metadata (handles "all"/"*" resolution)
202
+ let resolved_language = ocr_result
203
+ .metadata
204
+ .get("language")
205
+ .and_then(|v| v.as_str())
206
+ .unwrap_or(&tess_config.language)
207
+ .to_string();
208
+
209
+ // Convert HashMap<String, Value> to AHashMap<Cow<'static, str>, Value>
210
+ let mut additional = AHashMap::new();
211
+ for (key, value) in ocr_result.metadata {
212
+ additional.insert(Cow::Owned(key), value);
213
+ }
214
+
199
215
  let metadata = crate::types::Metadata {
200
216
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
201
- language: tess_config.language.clone(),
217
+ language: resolved_language,
202
218
  psm: tess_config.psm as i32,
203
219
  output_format: tess_config.output_format.clone(),
204
220
  table_count: ocr_result.tables.len(),
@@ -208,13 +224,13 @@ impl OcrBackend for TesseractBackend {
208
224
  .first()
209
225
  .and_then(|t| t.cells.first().map(|row| row.len())),
210
226
  })),
211
- additional: ocr_result.metadata,
227
+ additional,
212
228
  ..Default::default()
213
229
  };
214
230
 
215
231
  Ok(ExtractionResult {
216
232
  content: ocr_result.content,
217
- mime_type: ocr_result.mime_type,
233
+ mime_type: ocr_result.mime_type.into(),
218
234
  metadata,
219
235
  pages: None,
220
236
  tables: ocr_result
@@ -256,9 +272,23 @@ impl OcrBackend for TesseractBackend {
256
272
  source: Some(Box::new(e)),
257
273
  })?;
258
274
 
275
+ // Use resolved language from OCR result metadata (handles "all"/"*" resolution)
276
+ let resolved_language = ocr_result
277
+ .metadata
278
+ .get("language")
279
+ .and_then(|v| v.as_str())
280
+ .unwrap_or(&tess_config.language)
281
+ .to_string();
282
+
283
+ // Convert HashMap<String, Value> to AHashMap<Cow<'static, str>, Value>
284
+ let mut additional = AHashMap::new();
285
+ for (key, value) in ocr_result.metadata {
286
+ additional.insert(Cow::Owned(key), value);
287
+ }
288
+
259
289
  let metadata = crate::types::Metadata {
260
290
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
261
- language: tess_config.language.clone(),
291
+ language: resolved_language,
262
292
  psm: tess_config.psm as i32,
263
293
  output_format: tess_config.output_format.clone(),
264
294
  table_count: ocr_result.tables.len(),
@@ -268,13 +298,13 @@ impl OcrBackend for TesseractBackend {
268
298
  .first()
269
299
  .and_then(|t| t.cells.first().map(|row| row.len())),
270
300
  })),
271
- additional: ocr_result.metadata,
301
+ additional,
272
302
  ..Default::default()
273
303
  };
274
304
 
275
305
  Ok(ExtractionResult {
276
306
  content: ocr_result.content,
277
- mime_type: ocr_result.mime_type,
307
+ mime_type: ocr_result.mime_type.into(),
278
308
  metadata,
279
309
  pages: None,
280
310
  tables: ocr_result
@@ -1,5 +1,4 @@
1
1
  use serde::{Deserialize, Serialize};
2
- use std::collections::HashMap;
3
2
 
4
3
  pub use crate::types::ImagePreprocessingConfig;
5
4
 
@@ -154,7 +153,7 @@ impl From<&crate::types::TesseractConfig> for TesseractConfig {
154
153
  pub struct ExtractionResult {
155
154
  pub content: String,
156
155
  pub mime_type: String,
157
- pub metadata: HashMap<String, serde_json::Value>,
156
+ pub metadata: std::collections::HashMap<String, serde_json::Value>,
158
157
  pub tables: Vec<Table>,
159
158
  }
160
159
 
@@ -260,7 +259,7 @@ mod tests {
260
259
 
261
260
  #[test]
262
261
  fn test_extraction_result_creation() {
263
- let mut metadata = HashMap::new();
262
+ let mut metadata = std::collections::HashMap::new();
264
263
  metadata.insert("key".to_string(), serde_json::json!("value"));
265
264
 
266
265
  let table = Table {
@@ -308,7 +307,7 @@ mod tests {
308
307
  let result = crate::types::OcrExtractionResult {
309
308
  content: "content".to_string(),
310
309
  mime_type: "text/plain".to_string(),
311
- metadata: HashMap::new(),
310
+ metadata: std::collections::HashMap::new(),
312
311
  tables: vec![],
313
312
  };
314
313
 
@@ -131,6 +131,12 @@ lazy_static::lazy_static! {
131
131
  }
132
132
 
133
133
  pub fn validate_language_code(lang_code: &str) -> Result<(), OcrError> {
134
+ // Accept "all" and "*" as special values to auto-detect installed languages
135
+ let lower = lang_code.to_ascii_lowercase();
136
+ if lower == "all" || lower == "*" {
137
+ return Ok(());
138
+ }
139
+
134
140
  for code in lang_code.split('+') {
135
141
  if !TESSERACT_SUPPORTED_LANGUAGE_CODES.contains(code) {
136
142
  return Err(OcrError::InvalidLanguageCode(format!(
@@ -156,6 +162,14 @@ pub fn validate_tesseract_version(version: u32) -> Result<(), OcrError> {
156
162
  mod tests {
157
163
  use super::*;
158
164
 
165
+ #[test]
166
+ fn test_validate_language_code_all_keyword() {
167
+ assert!(validate_language_code("all").is_ok());
168
+ assert!(validate_language_code("*").is_ok());
169
+ assert!(validate_language_code("ALL").is_ok());
170
+ assert!(validate_language_code("All").is_ok());
171
+ }
172
+
159
173
  #[test]
160
174
  fn test_validate_language_code_valid() {
161
175
  assert!(validate_language_code("eng").is_ok());
@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
10
10
  /// `Metadata` structure. Common fields like title, authors, keywords, and dates
11
11
  /// are now at the `Metadata` level.
12
12
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
13
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
13
14
  pub struct PdfMetadata {
14
15
  /// PDF version (e.g., "1.7", "2.0")
15
16
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -19,6 +19,7 @@ mod tests {
19
19
  use crate::plugins::Plugin;
20
20
  use crate::types::ExtractionResult;
21
21
  use async_trait::async_trait;
22
+ use std::borrow::Cow;
22
23
 
23
24
  struct MockExtractor {
24
25
  mime_types: Vec<&'static str>,
@@ -53,7 +54,7 @@ mod tests {
53
54
  ) -> Result<ExtractionResult> {
54
55
  Ok(ExtractionResult {
55
56
  content: String::from_utf8_lossy(content).to_string(),
56
- mime_type: mime_type.to_string(),
57
+ mime_type: mime_type.to_string().into(),
57
58
  metadata: crate::types::Metadata::default(),
58
59
  tables: vec![],
59
60
  detected_languages: None,
@@ -228,7 +229,7 @@ mod tests {
228
229
  ) -> Result<ExtractionResult> {
229
230
  Ok(ExtractionResult {
230
231
  content: String::new(),
231
- mime_type: String::new(),
232
+ mime_type: Cow::Borrowed(""),
232
233
  metadata: crate::types::Metadata::default(),
233
234
  tables: vec![],
234
235
  detected_languages: None,
@@ -50,7 +50,7 @@ use std::sync::Arc;
50
50
  /// -> Result<ExtractionResult> {
51
51
  /// Ok(ExtractionResult {
52
52
  /// content: String::from_utf8_lossy(content).to_string(),
53
- /// mime_type: mime_type.to_string(),
53
+ /// mime_type: mime_type.to_string().into(),
54
54
  /// metadata: Metadata::default(),
55
55
  /// tables: vec![],
56
56
  /// detected_languages: None,
@@ -189,6 +189,7 @@ mod tests {
189
189
  use crate::types::ExtractionResult;
190
190
  use async_trait::async_trait;
191
191
  use serial_test::serial;
192
+ use std::borrow::Cow;
192
193
 
193
194
  struct MockExtractor {
194
195
  mime_types: Vec<&'static str>,
@@ -223,7 +224,7 @@ mod tests {
223
224
  ) -> Result<ExtractionResult> {
224
225
  Ok(ExtractionResult {
225
226
  content: String::from_utf8_lossy(content).to_string(),
226
- mime_type: mime_type.to_string(),
227
+ mime_type: mime_type.to_string().into(),
227
228
  metadata: crate::types::Metadata::default(),
228
229
  tables: vec![],
229
230
  detected_languages: None,
@@ -362,7 +363,7 @@ mod tests {
362
363
  async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
363
364
  Ok(ExtractionResult {
364
365
  content: String::new(),
365
- mime_type: String::new(),
366
+ mime_type: Cow::Borrowed(""),
366
367
  metadata: crate::types::Metadata::default(),
367
368
  tables: vec![],
368
369
  detected_languages: None,
@@ -410,7 +411,7 @@ mod tests {
410
411
  async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
411
412
  Ok(ExtractionResult {
412
413
  content: String::new(),
413
- mime_type: String::new(),
414
+ mime_type: Cow::Borrowed(""),
414
415
  metadata: crate::types::Metadata::default(),
415
416
  tables: vec![],
416
417
  detected_languages: None,
@@ -61,7 +61,7 @@ pub enum OcrBackendType {
61
61
  /// // Implement OCR logic here
62
62
  /// Ok(ExtractionResult {
63
63
  /// content: "Extracted text".to_string(),
64
- /// mime_type: "text/plain".to_string(),
64
+ /// mime_type: Cow::Borrowed("text/plain"),
65
65
  /// metadata: Metadata::default(),
66
66
  /// tables: vec![],
67
67
  /// detected_languages: None,
@@ -142,7 +142,7 @@ pub trait OcrBackend: Plugin {
142
142
  ///
143
143
  /// Ok(ExtractionResult {
144
144
  /// content: text,
145
- /// mime_type: "text/plain".to_string(),
145
+ /// mime_type: Cow::Borrowed("text/plain"),
146
146
  /// metadata: Metadata::default(),
147
147
  /// tables: vec![],
148
148
  /// detected_languages: None,
@@ -315,7 +315,7 @@ pub trait OcrBackend: Plugin {
315
315
  /// async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
316
316
  /// Ok(ExtractionResult {
317
317
  /// content: "text".to_string(),
318
- /// mime_type: "text/plain".to_string(),
318
+ /// mime_type: Cow::Borrowed("text/plain"),
319
319
  /// metadata: Metadata::default(),
320
320
  /// tables: vec![],
321
321
  /// detected_languages: None,
@@ -450,6 +450,7 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
450
450
  #[cfg(test)]
451
451
  mod tests {
452
452
  use super::*;
453
+ use std::borrow::Cow;
453
454
 
454
455
  struct MockOcrBackend {
455
456
  languages: Vec<String>,
@@ -478,7 +479,7 @@ mod tests {
478
479
  async fn process_image(&self, _image_bytes: &[u8], _config: &OcrConfig) -> Result<ExtractionResult> {
479
480
  Ok(ExtractionResult {
480
481
  content: "Mocked OCR text".to_string(),
481
- mime_type: "text/plain".to_string(),
482
+ mime_type: Cow::Borrowed("text/plain"),
482
483
  metadata: crate::types::Metadata::default(),
483
484
  tables: vec![],
484
485
  detected_languages: None,
@@ -18,8 +18,9 @@ mod tests {
18
18
  use crate::core::config::ExtractionConfig;
19
19
  use crate::plugins::Plugin;
20
20
  use crate::types::ExtractionResult;
21
+ use ahash::AHashMap;
21
22
  use async_trait::async_trait;
22
- use std::collections::HashMap;
23
+ use std::borrow::Cow;
23
24
 
24
25
  struct MockPostProcessor {
25
26
  stage: ProcessingStage,
@@ -49,7 +50,7 @@ mod tests {
49
50
  result
50
51
  .metadata
51
52
  .additional
52
- .insert("processed_by".to_string(), serde_json::json!(self.name()));
53
+ .insert(Cow::Borrowed("processed_by"), serde_json::json!(self.name()));
53
54
  Ok(())
54
55
  }
55
56
 
@@ -66,7 +67,7 @@ mod tests {
66
67
 
67
68
  let mut result = ExtractionResult {
68
69
  content: "test content".to_string(),
69
- mime_type: "text/plain".to_string(),
70
+ mime_type: Cow::Borrowed("text/plain"),
70
71
  metadata: crate::types::Metadata::default(),
71
72
  tables: vec![],
72
73
  detected_languages: None,
@@ -118,7 +119,7 @@ mod tests {
118
119
 
119
120
  let result = ExtractionResult {
120
121
  content: "test".to_string(),
121
- mime_type: "text/plain".to_string(),
122
+ mime_type: Cow::Borrowed("text/plain"),
122
123
  metadata: crate::types::Metadata::default(),
123
124
  tables: vec![],
124
125
  detected_languages: None,
@@ -187,7 +188,7 @@ mod tests {
187
188
 
188
189
  let mut result = ExtractionResult {
189
190
  content: String::new(),
190
- mime_type: "text/plain".to_string(),
191
+ mime_type: Cow::Borrowed("text/plain"),
191
192
  metadata: crate::types::Metadata::default(),
192
193
  tables: vec![],
193
194
  detected_languages: None,
@@ -211,12 +212,12 @@ mod tests {
211
212
  stage: ProcessingStage::Early,
212
213
  };
213
214
 
214
- let mut additional = HashMap::new();
215
- additional.insert("existing_key".to_string(), serde_json::json!("existing_value"));
215
+ let mut additional = AHashMap::new();
216
+ additional.insert(Cow::Borrowed("existing_key"), serde_json::json!("existing_value"));
216
217
 
217
218
  let mut result = ExtractionResult {
218
219
  content: "test".to_string(),
219
- mime_type: "text/plain".to_string(),
220
+ mime_type: Cow::Borrowed("text/plain"),
220
221
  metadata: crate::types::Metadata {
221
222
  additional,
222
223
  ..Default::default()
@@ -248,7 +249,7 @@ mod tests {
248
249
 
249
250
  let result = ExtractionResult {
250
251
  content: "test".to_string(),
251
- mime_type: "text/plain".to_string(),
252
+ mime_type: Cow::Borrowed("text/plain"),
252
253
  metadata: crate::types::Metadata::default(),
253
254
  tables: vec![],
254
255
  detected_languages: None,
@@ -301,7 +302,7 @@ mod tests {
301
302
 
302
303
  let pdf_result = ExtractionResult {
303
304
  content: "test".to_string(),
304
- mime_type: "application/pdf".to_string(),
305
+ mime_type: Cow::Borrowed("application/pdf"),
305
306
  metadata: crate::types::Metadata::default(),
306
307
  tables: vec![],
307
308
  detected_languages: None,
@@ -314,7 +315,7 @@ mod tests {
314
315
 
315
316
  let txt_result = ExtractionResult {
316
317
  content: "test".to_string(),
317
- mime_type: "text/plain".to_string(),
318
+ mime_type: Cow::Borrowed("text/plain"),
318
319
  metadata: crate::types::Metadata::default(),
319
320
  tables: vec![],
320
321
  detected_languages: None,
@@ -345,7 +346,7 @@ mod tests {
345
346
 
346
347
  let mut result = ExtractionResult {
347
348
  content: "test".to_string(),
348
- mime_type: "text/plain".to_string(),
349
+ mime_type: Cow::Borrowed("text/plain"),
349
350
  metadata: crate::types::Metadata::default(),
350
351
  tables: vec![table],
351
352
  detected_languages: None,
@@ -225,6 +225,7 @@ mod tests {
225
225
  use crate::plugins::Plugin;
226
226
  use crate::types::ExtractionResult;
227
227
  use async_trait::async_trait;
228
+ use std::borrow::Cow;
228
229
 
229
230
  struct MockExtractor {
230
231
  name: String,
@@ -252,7 +253,7 @@ mod tests {
252
253
  async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
253
254
  Ok(ExtractionResult {
254
255
  content: "test".to_string(),
255
- mime_type: "text/plain".to_string(),
256
+ mime_type: Cow::Borrowed("text/plain"),
256
257
  metadata: crate::types::Metadata::default(),
257
258
  tables: vec![],
258
259
  detected_languages: None,
@@ -494,7 +495,7 @@ mod tests {
494
495
  async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
495
496
  Ok(ExtractionResult {
496
497
  content: "test".to_string(),
497
- mime_type: "text/plain".to_string(),
498
+ mime_type: Cow::Borrowed("text/plain"),
498
499
  metadata: crate::types::Metadata::default(),
499
500
  tables: vec![],
500
501
  detected_languages: None,
@@ -191,6 +191,7 @@ mod tests {
191
191
  use crate::plugins::{OcrBackend, Plugin};
192
192
  use crate::types::ExtractionResult;
193
193
  use async_trait::async_trait;
194
+ use std::borrow::Cow;
194
195
 
195
196
  struct MockOcrBackend {
196
197
  name: String,
@@ -217,7 +218,7 @@ mod tests {
217
218
  async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
218
219
  Ok(ExtractionResult {
219
220
  content: "test".to_string(),
220
- mime_type: "text/plain".to_string(),
221
+ mime_type: Cow::Borrowed("text/plain"),
221
222
  metadata: crate::types::Metadata::default(),
222
223
  tables: vec![],
223
224
  detected_languages: None,
@@ -344,7 +345,7 @@ mod tests {
344
345
  async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
345
346
  Ok(ExtractionResult {
346
347
  content: "test".to_string(),
347
- mime_type: "text/plain".to_string(),
348
+ mime_type: Cow::Borrowed("text/plain"),
348
349
  metadata: crate::types::Metadata::default(),
349
350
  tables: vec![],
350
351
  detected_languages: None,
@@ -19,8 +19,9 @@ mod tests {
19
19
  use crate::core::config::ExtractionConfig;
20
20
  use crate::plugins::Plugin;
21
21
  use crate::types::ExtractionResult;
22
+ use ahash::AHashMap;
22
23
  use async_trait::async_trait;
23
- use std::collections::HashMap;
24
+ use std::borrow::Cow;
24
25
 
25
26
  struct MockValidator {
26
27
  should_fail: bool,
@@ -61,7 +62,7 @@ mod tests {
61
62
 
62
63
  let result = ExtractionResult {
63
64
  content: "test content".to_string(),
64
- mime_type: "text/plain".to_string(),
65
+ mime_type: Cow::Borrowed("text/plain"),
65
66
  metadata: crate::types::Metadata::default(),
66
67
  tables: vec![],
67
68
  detected_languages: None,
@@ -82,7 +83,7 @@ mod tests {
82
83
 
83
84
  let result = ExtractionResult {
84
85
  content: "test content".to_string(),
85
- mime_type: "text/plain".to_string(),
86
+ mime_type: Cow::Borrowed("text/plain"),
86
87
  metadata: crate::types::Metadata::default(),
87
88
  tables: vec![],
88
89
  detected_languages: None,
@@ -105,7 +106,7 @@ mod tests {
105
106
 
106
107
  let result = ExtractionResult {
107
108
  content: "test".to_string(),
108
- mime_type: "text/plain".to_string(),
109
+ mime_type: Cow::Borrowed("text/plain"),
109
110
  metadata: crate::types::Metadata::default(),
110
111
  tables: vec![],
111
112
  detected_languages: None,
@@ -143,7 +144,7 @@ mod tests {
143
144
 
144
145
  let result = ExtractionResult {
145
146
  content: String::new(),
146
- mime_type: "text/plain".to_string(),
147
+ mime_type: Cow::Borrowed("text/plain"),
147
148
  metadata: crate::types::Metadata::default(),
148
149
  tables: vec![],
149
150
  detected_languages: None,
@@ -193,7 +194,7 @@ mod tests {
193
194
 
194
195
  let pdf_result = ExtractionResult {
195
196
  content: "test".to_string(),
196
- mime_type: "application/pdf".to_string(),
197
+ mime_type: Cow::Borrowed("application/pdf"),
197
198
  metadata: crate::types::Metadata::default(),
198
199
  tables: vec![],
199
200
  detected_languages: None,
@@ -206,7 +207,7 @@ mod tests {
206
207
 
207
208
  let txt_result = ExtractionResult {
208
209
  content: "test".to_string(),
209
- mime_type: "text/plain".to_string(),
210
+ mime_type: Cow::Borrowed("text/plain"),
210
211
  metadata: crate::types::Metadata::default(),
211
212
  tables: vec![],
212
213
  detected_languages: None,
@@ -292,7 +293,7 @@ mod tests {
292
293
 
293
294
  let result = ExtractionResult {
294
295
  content: "test".to_string(),
295
- mime_type: "text/plain".to_string(),
296
+ mime_type: Cow::Borrowed("text/plain"),
296
297
  metadata: crate::types::Metadata::default(),
297
298
  tables: vec![],
298
299
  detected_languages: None,
@@ -318,12 +319,12 @@ mod tests {
318
319
  async fn test_validator_with_metadata() {
319
320
  let validator = MockValidator { should_fail: false };
320
321
 
321
- let mut additional = HashMap::new();
322
- additional.insert("quality_score".to_string(), serde_json::json!(0.95));
322
+ let mut additional = AHashMap::new();
323
+ additional.insert(Cow::Borrowed("quality_score"), serde_json::json!(0.95));
323
324
 
324
325
  let result = ExtractionResult {
325
326
  content: "test".to_string(),
326
- mime_type: "text/plain".to_string(),
327
+ mime_type: Cow::Borrowed("text/plain"),
327
328
  metadata: crate::types::Metadata {
328
329
  additional,
329
330
  ..Default::default()
@@ -355,7 +356,7 @@ mod tests {
355
356
 
356
357
  let result = ExtractionResult {
357
358
  content: "test".to_string(),
358
- mime_type: "text/plain".to_string(),
359
+ mime_type: Cow::Borrowed("text/plain"),
359
360
  metadata: crate::types::Metadata::default(),
360
361
  tables: vec![table],
361
362
  detected_languages: None,
@@ -386,7 +387,7 @@ mod tests {
386
387
  for mime_type in mime_types {
387
388
  let result = ExtractionResult {
388
389
  content: "test".to_string(),
389
- mime_type: mime_type.to_string(),
390
+ mime_type: Cow::Borrowed(mime_type),
390
391
  metadata: crate::types::Metadata::default(),
391
392
  tables: vec![],
392
393
  detected_languages: None,
@@ -407,7 +408,7 @@ mod tests {
407
408
 
408
409
  let result = ExtractionResult {
409
410
  content: "test content ".repeat(10000),
410
- mime_type: "text/plain".to_string(),
411
+ mime_type: Cow::Borrowed("text/plain"),
411
412
  metadata: crate::types::Metadata::default(),
412
413
  tables: vec![],
413
414
  detected_languages: None,
@@ -1,7 +1,7 @@
1
+ use ahash::AHashMap;
1
2
  use once_cell::sync::Lazy;
2
3
  use regex::Regex;
3
4
  use std::borrow::Cow;
4
- use std::collections::HashMap;
5
5
 
6
6
  use crate::utils::quality::{collapse_scattered_ascii, normalize_whitespace_ascii};
7
7
 
@@ -123,7 +123,7 @@ where
123
123
  }
124
124
  }
125
125
 
126
- pub fn calculate_quality_score(text: &str, metadata: Option<&HashMap<String, serde_json::Value>>) -> f64 {
126
+ pub fn calculate_quality_score(text: &str, metadata: Option<&AHashMap<Cow<'static, str>, serde_json::Value>>) -> f64 {
127
127
  if text.is_empty() || text.trim().is_empty() {
128
128
  return 0.0;
129
129
  }
@@ -266,7 +266,7 @@ fn calculate_structure_bonus(text: &str) -> f64 {
266
266
  }
267
267
 
268
268
  #[inline]
269
- fn calculate_metadata_bonus(metadata: &HashMap<String, serde_json::Value>) -> f64 {
269
+ fn calculate_metadata_bonus(metadata: &AHashMap<Cow<'static, str>, serde_json::Value>) -> f64 {
270
270
  const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
271
271
 
272
272
  let present_fields = IMPORTANT_FIELDS
@@ -491,9 +491,9 @@ mod tests {
491
491
  #[test]
492
492
  fn test_calculate_quality_score_with_metadata() {
493
493
  let text = "This is a normal text with proper structure.";
494
- let mut metadata = HashMap::new();
495
- metadata.insert("title".to_string(), serde_json::json!("Test Title"));
496
- metadata.insert("author".to_string(), serde_json::json!("Test Author"));
494
+ let mut metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
495
+ metadata.insert(Cow::Borrowed("title"), serde_json::json!("Test Title"));
496
+ metadata.insert(Cow::Borrowed("author"), serde_json::json!("Test Author"));
497
497
 
498
498
  let score = calculate_quality_score(text, Some(&metadata));
499
499
  assert!(score > 0.0);
@@ -558,19 +558,19 @@ mod tests {
558
558
 
559
559
  #[test]
560
560
  fn test_calculate_metadata_bonus_empty() {
561
- let metadata = HashMap::new();
561
+ let metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
562
562
  let bonus = calculate_metadata_bonus(&metadata);
563
563
  assert_eq!(bonus, 0.0);
564
564
  }
565
565
 
566
566
  #[test]
567
567
  fn test_calculate_metadata_bonus_full() {
568
- let mut metadata = HashMap::new();
569
- metadata.insert("title".to_string(), serde_json::json!("Title"));
570
- metadata.insert("author".to_string(), serde_json::json!("Author"));
571
- metadata.insert("subject".to_string(), serde_json::json!("Subject"));
572
- metadata.insert("description".to_string(), serde_json::json!("Description"));
573
- metadata.insert("keywords".to_string(), serde_json::json!("Keywords"));
568
+ let mut metadata: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
569
+ metadata.insert(Cow::Borrowed("title"), serde_json::json!("Title"));
570
+ metadata.insert(Cow::Borrowed("author"), serde_json::json!("Author"));
571
+ metadata.insert(Cow::Borrowed("subject"), serde_json::json!("Subject"));
572
+ metadata.insert(Cow::Borrowed("description"), serde_json::json!("Description"));
573
+ metadata.insert(Cow::Borrowed("keywords"), serde_json::json!("Keywords"));
574
574
 
575
575
  let bonus = calculate_metadata_bonus(&metadata);
576
576
  assert_eq!(bonus, 1.0);
@@ -15,6 +15,7 @@
15
15
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
16
16
  use crate::{ExtractionConfig, ExtractionResult, Result};
17
17
  use async_trait::async_trait;
18
+ use std::borrow::Cow;
18
19
 
19
20
  /// Post-processor that calculates quality score and cleans text.
20
21
  ///
@@ -65,7 +66,7 @@ impl PostProcessor for QualityProcessor {
65
66
  };
66
67
 
67
68
  result.metadata.additional.insert(
68
- "quality_score".to_string(),
69
+ Cow::Borrowed("quality_score"),
69
70
  serde_json::Value::Number(
70
71
  serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
71
72
  ),
@@ -116,7 +117,7 @@ mod tests {
116
117
 
117
118
  let mut result = ExtractionResult {
118
119
  content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
119
- mime_type: "text/plain".to_string(),
120
+ mime_type: Cow::Borrowed("text/plain"),
120
121
  metadata: Metadata::default(),
121
122
  tables: vec![],
122
123
  detected_languages: None,
@@ -144,7 +145,7 @@ mod tests {
144
145
 
145
146
  let mut result = ExtractionResult {
146
147
  content: "Some text".to_string(),
147
- mime_type: "text/plain".to_string(),
148
+ mime_type: Cow::Borrowed("text/plain"),
148
149
  metadata: Metadata::default(),
149
150
  tables: vec![],
150
151
  detected_languages: None,
@@ -179,7 +180,7 @@ mod tests {
179
180
 
180
181
  let result = ExtractionResult {
181
182
  content: "Sample text".to_string(),
182
- mime_type: "text/plain".to_string(),
183
+ mime_type: Cow::Borrowed("text/plain"),
183
184
  metadata: Metadata::default(),
184
185
  tables: vec![],
185
186
  detected_languages: None,
@@ -209,7 +210,7 @@ mod tests {
209
210
 
210
211
  let short_result = ExtractionResult {
211
212
  content: "Short".to_string(),
212
- mime_type: "text/plain".to_string(),
213
+ mime_type: Cow::Borrowed("text/plain"),
213
214
  metadata: Metadata::default(),
214
215
  tables: vec![],
215
216
  detected_languages: None,
@@ -222,7 +223,7 @@ mod tests {
222
223
 
223
224
  let long_result = ExtractionResult {
224
225
  content: "a".repeat(1000000),
225
- mime_type: "text/plain".to_string(),
226
+ mime_type: Cow::Borrowed("text/plain"),
226
227
  metadata: Metadata::default(),
227
228
  tables: vec![],
228
229
  detected_languages: None,