kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -4,10 +4,12 @@
4
4
  //! and recursive processing of `<outline>` elements in the `<body>` section.
5
5
 
6
6
  use crate::Result;
7
- use std::collections::HashMap;
7
+ use ahash::AHashMap;
8
+ use std::borrow::Cow;
8
9
 
9
10
  #[cfg(feature = "office")]
10
11
  use roxmltree::Node;
12
+ use serde_json;
11
13
 
12
14
  /// Extract OPML content and metadata from raw bytes.
13
15
  ///
@@ -20,7 +22,9 @@ use roxmltree::Node;
20
22
  /// - Extracted content as a String (outline hierarchy with indentation)
21
23
  /// - Metadata HashMap with key-value pairs from the head section
22
24
  #[cfg(feature = "office")]
23
- pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, HashMap<String, serde_json::Value>)> {
25
+ pub(crate) fn extract_content_and_metadata(
26
+ content: &[u8],
27
+ ) -> Result<(String, AHashMap<Cow<'static, str>, serde_json::Value>)> {
24
28
  let doc = roxmltree::Document::parse(
25
29
  std::str::from_utf8(content)
26
30
  .map_err(|e| crate::KreuzbergError::Other(format!("Invalid UTF-8 in OPML: {}", e)))?,
@@ -28,7 +32,7 @@ pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, Ha
28
32
  .map_err(|e| crate::KreuzbergError::Other(format!("Failed to parse OPML: {}", e)))?;
29
33
 
30
34
  let mut extracted_content = String::new();
31
- let mut metadata = HashMap::new();
35
+ let mut metadata = AHashMap::new();
32
36
 
33
37
  if let Some(opml) = doc.root().children().find(|n| n.tag_name().name() == "opml") {
34
38
  if let Some(head) = opml.children().find(|n| n.tag_name().name() == "head") {
@@ -60,7 +64,7 @@ pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, Ha
60
64
  /// - ownerName: Document owner's name
61
65
  /// - ownerEmail: Document owner's email
62
66
  #[cfg(feature = "office")]
63
- fn extract_metadata_from_head(head: Node, metadata: &mut HashMap<String, serde_json::Value>) {
67
+ fn extract_metadata_from_head(head: Node, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) {
64
68
  for child in head.children().filter(|n| n.is_element()) {
65
69
  let tag = child.tag_name().name();
66
70
  let text = child.text().unwrap_or("").trim();
@@ -71,19 +75,19 @@ fn extract_metadata_from_head(head: Node, metadata: &mut HashMap<String, serde_j
71
75
 
72
76
  match tag {
73
77
  "title" => {
74
- metadata.insert("title".to_string(), serde_json::json!(text));
78
+ metadata.insert(Cow::Borrowed("title"), serde_json::json!(text));
75
79
  }
76
80
  "dateCreated" => {
77
- metadata.insert("dateCreated".to_string(), serde_json::json!(text));
81
+ metadata.insert(Cow::Borrowed("dateCreated"), serde_json::json!(text));
78
82
  }
79
83
  "dateModified" => {
80
- metadata.insert("dateModified".to_string(), serde_json::json!(text));
84
+ metadata.insert(Cow::Borrowed("dateModified"), serde_json::json!(text));
81
85
  }
82
86
  "ownerName" => {
83
- metadata.insert("ownerName".to_string(), serde_json::json!(text));
87
+ metadata.insert(Cow::Borrowed("ownerName"), serde_json::json!(text));
84
88
  }
85
89
  "ownerEmail" => {
86
- metadata.insert("ownerEmail".to_string(), serde_json::json!(text));
90
+ metadata.insert(Cow::Borrowed("ownerEmail"), serde_json::json!(text));
87
91
  }
88
92
  _ => {}
89
93
  }
@@ -23,9 +23,11 @@ use crate::plugins::{DocumentExtractor, Plugin};
23
23
  #[cfg(feature = "office")]
24
24
  use crate::types::{ExtractionResult, Metadata, Table};
25
25
  #[cfg(feature = "office")]
26
+ use ahash::AHashMap;
27
+ #[cfg(feature = "office")]
26
28
  use async_trait::async_trait;
27
29
  #[cfg(feature = "office")]
28
- use std::collections::HashMap;
30
+ use std::borrow::Cow;
29
31
 
30
32
  #[cfg(feature = "office")]
31
33
  use org::Org;
@@ -57,33 +59,33 @@ impl OrgModeExtractor {
57
59
  /// Also extracts document structure and content in parallel.
58
60
  fn extract_metadata_and_content(org_text: &str, org: &Org) -> (Metadata, String) {
59
61
  let mut metadata = Metadata::default();
60
- let mut additional = HashMap::new();
62
+ let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = Default::default();
61
63
 
62
64
  for line in org_text.lines().take(100) {
63
65
  let trimmed = line.trim();
64
66
 
65
67
  if let Some(rest) = trimmed.strip_prefix("#+TITLE:") {
66
68
  let value = rest.trim().to_string();
67
- additional.insert("title".to_string(), serde_json::json!(value));
69
+ additional.insert(Cow::Borrowed("title"), serde_json::json!(value));
68
70
  } else if let Some(rest) = trimmed.strip_prefix("#+AUTHOR:") {
69
71
  let value = rest.trim().to_string();
70
- additional.insert("author".to_string(), serde_json::json!(&value));
71
- additional.insert("authors".to_string(), serde_json::json!(vec![value]));
72
+ additional.insert(Cow::Borrowed("author"), serde_json::json!(&value));
73
+ additional.insert(Cow::Borrowed("authors"), serde_json::json!(vec![value]));
72
74
  } else if let Some(rest) = trimmed.strip_prefix("#+DATE:") {
73
75
  let value = rest.trim().to_string();
74
76
  metadata.created_at = Some(value.clone());
75
- additional.insert("date".to_string(), serde_json::json!(value));
77
+ additional.insert(Cow::Borrowed("date"), serde_json::json!(value));
76
78
  } else if let Some(rest) = trimmed.strip_prefix("#+KEYWORDS:") {
77
79
  let value = rest.trim();
78
80
  let keywords: Vec<&str> = value.split(',').map(|s| s.trim()).collect();
79
- additional.insert("keywords".to_string(), serde_json::json!(keywords));
81
+ additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords));
80
82
  } else if let Some(rest) = trimmed.strip_prefix("#+")
81
83
  && let Some((key, val)) = rest.split_once(':')
82
84
  {
83
85
  let key_lower = key.trim().to_lowercase();
84
86
  let value = val.trim();
85
87
  if !key_lower.is_empty() && !value.is_empty() {
86
- additional.insert(format!("directive_{}", key_lower), serde_json::json!(value));
88
+ additional.insert(Cow::Owned(format!("directive_{}", key_lower)), serde_json::json!(value));
87
89
  }
88
90
  }
89
91
  }
@@ -298,7 +300,7 @@ impl DocumentExtractor for OrgModeExtractor {
298
300
 
299
301
  Ok(ExtractionResult {
300
302
  content: extracted_content,
301
- mime_type: mime_type.to_string(),
303
+ mime_type: mime_type.to_string().into(),
302
304
  metadata,
303
305
  tables,
304
306
  detected_languages: None,
@@ -7,6 +7,8 @@ mod extraction;
7
7
  mod ocr;
8
8
  mod pages;
9
9
 
10
+ use bytes::Bytes;
11
+
10
12
  use crate::Result;
11
13
  use crate::core::config::ExtractionConfig;
12
14
  use crate::plugins::{DocumentExtractor, Plugin};
@@ -236,9 +238,14 @@ impl DocumentExtractor for PdfExtractor {
236
238
  .into_iter()
237
239
  .enumerate()
238
240
  .map(|(idx, img)| {
239
- let format = img.filters.first().cloned().unwrap_or_else(|| "unknown".to_string());
241
+ let format = img
242
+ .filters
243
+ .first()
244
+ .cloned()
245
+ .map(std::borrow::Cow::Owned)
246
+ .unwrap_or(std::borrow::Cow::Borrowed("unknown"));
240
247
  crate::types::ExtractedImage {
241
- data: img.data,
248
+ data: Bytes::from(img.data),
242
249
  format,
243
250
  image_index: idx,
244
251
  page_number: Some(img.page_number),
@@ -265,7 +272,7 @@ impl DocumentExtractor for PdfExtractor {
265
272
 
266
273
  Ok(ExtractionResult {
267
274
  content: text,
268
- mime_type: mime_type.to_string(),
275
+ mime_type: mime_type.to_string().into(),
269
276
  metadata: Metadata {
270
277
  #[cfg(feature = "pdf")]
271
278
  title: pdf_metadata.title.clone(),
@@ -6,7 +6,9 @@ use crate::Result;
6
6
  use crate::core::config::ExtractionConfig;
7
7
  use crate::plugins::{DocumentExtractor, Plugin};
8
8
  use crate::types::{ExtractionResult, Metadata};
9
+ use ahash::AHashMap;
9
10
  use async_trait::async_trait;
11
+ use std::borrow::Cow;
10
12
  use std::path::Path;
11
13
 
12
14
  #[cfg(feature = "ocr")]
@@ -66,7 +68,7 @@ impl PptxExtractor {
66
68
  Ok(ocr_extraction) => {
67
69
  let extraction_result = ExtractionResult {
68
70
  content: ocr_extraction.content,
69
- mime_type: ocr_extraction.mime_type,
71
+ mime_type: ocr_extraction.mime_type.into(),
70
72
  metadata: Metadata::default(),
71
73
  tables: vec![],
72
74
  detected_languages: None,
@@ -137,10 +139,10 @@ impl DocumentExtractor for PptxExtractor {
137
139
  crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images, config.pages.as_ref())?
138
140
  };
139
141
 
140
- let mut additional = std::collections::HashMap::new();
141
- additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
142
- additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
143
- additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
142
+ let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
143
+ additional.insert(Cow::Borrowed("slide_count"), serde_json::json!(pptx_result.slide_count));
144
+ additional.insert(Cow::Borrowed("image_count"), serde_json::json!(pptx_result.image_count));
145
+ additional.insert(Cow::Borrowed("table_count"), serde_json::json!(pptx_result.table_count));
144
146
 
145
147
  let images = if extract_images {
146
148
  // Image extraction is enabled, return images or empty vector
@@ -174,7 +176,7 @@ impl DocumentExtractor for PptxExtractor {
174
176
 
175
177
  Ok(ExtractionResult {
176
178
  content: pptx_result.content,
177
- mime_type: mime_type.to_string(),
179
+ mime_type: mime_type.to_string().into(),
178
180
  metadata,
179
181
  pages: pptx_result.page_contents,
180
182
  tables: vec![],
@@ -202,10 +204,10 @@ impl DocumentExtractor for PptxExtractor {
202
204
  let pptx_result =
203
205
  crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;
204
206
 
205
- let mut additional = std::collections::HashMap::new();
206
- additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
207
- additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
208
- additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
207
+ let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
208
+ additional.insert(Cow::Borrowed("slide_count"), serde_json::json!(pptx_result.slide_count));
209
+ additional.insert(Cow::Borrowed("image_count"), serde_json::json!(pptx_result.image_count));
210
+ additional.insert(Cow::Borrowed("table_count"), serde_json::json!(pptx_result.table_count));
209
211
 
210
212
  let images = if extract_images {
211
213
  // Image extraction is enabled, return images or empty vector
@@ -239,7 +241,7 @@ impl DocumentExtractor for PptxExtractor {
239
241
 
240
242
  Ok(ExtractionResult {
241
243
  content: pptx_result.content,
242
- mime_type: mime_type.to_string(),
244
+ mime_type: mime_type.to_string().into(),
243
245
  metadata,
244
246
  pages: pptx_result.page_contents,
245
247
  tables: vec![],
@@ -21,9 +21,11 @@ use crate::plugins::{DocumentExtractor, Plugin};
21
21
  #[cfg(feature = "office")]
22
22
  use crate::types::{ExtractionResult, Metadata, Table};
23
23
  #[cfg(feature = "office")]
24
+ use ahash::AHashMap;
25
+ #[cfg(feature = "office")]
24
26
  use async_trait::async_trait;
25
27
  #[cfg(feature = "office")]
26
- use std::collections::HashMap;
28
+ use std::borrow::Cow;
27
29
 
28
30
  /// Native Rust reStructuredText extractor.
29
31
  ///
@@ -48,7 +50,7 @@ impl RstExtractor {
48
50
  /// Uses document tree parsing and fallback text extraction.
49
51
  fn extract_text_and_metadata(content: &str) -> (String, Metadata) {
50
52
  let mut metadata = Metadata::default();
51
- let mut additional = HashMap::new();
53
+ let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
52
54
 
53
55
  let text = Self::extract_text_from_rst(content, &mut additional);
54
56
 
@@ -60,7 +62,7 @@ impl RstExtractor {
60
62
  ///
61
63
  /// This is the main extraction engine that processes RST line-by-line
62
64
  /// and extracts all document content including headings, code blocks, lists, etc.
63
- fn extract_text_from_rst(content: &str, metadata: &mut HashMap<String, serde_json::Value>) -> String {
65
+ fn extract_text_from_rst(content: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) -> String {
64
66
  let mut output = String::new();
65
67
  let lines: Vec<&str> = content.lines().collect();
66
68
  let mut i = 0;
@@ -228,24 +230,24 @@ impl RstExtractor {
228
230
  }
229
231
 
230
232
  /// Add a metadata field from RST field list.
231
- fn add_metadata_field(key: &str, value: &str, metadata: &mut HashMap<String, serde_json::Value>) {
233
+ fn add_metadata_field(key: &str, value: &str, metadata: &mut AHashMap<Cow<'static, str>, serde_json::Value>) {
232
234
  let key_lower = key.to_lowercase();
233
235
  match key_lower.as_str() {
234
236
  "author" | "authors" => {
235
- metadata.insert("author".to_string(), serde_json::Value::String(value.to_string()));
237
+ metadata.insert(Cow::Borrowed("author"), serde_json::Value::String(value.to_string()));
236
238
  }
237
239
  "date" => {
238
- metadata.insert("date".to_string(), serde_json::Value::String(value.to_string()));
240
+ metadata.insert(Cow::Borrowed("date"), serde_json::Value::String(value.to_string()));
239
241
  }
240
242
  "version" | "revision" => {
241
- metadata.insert("version".to_string(), serde_json::Value::String(value.to_string()));
243
+ metadata.insert(Cow::Borrowed("version"), serde_json::Value::String(value.to_string()));
242
244
  }
243
245
  "title" => {
244
- metadata.insert("title".to_string(), serde_json::Value::String(value.to_string()));
246
+ metadata.insert(Cow::Borrowed("title"), serde_json::Value::String(value.to_string()));
245
247
  }
246
248
  _ => {
247
249
  metadata.insert(
248
- format!("field_{}", key_lower),
250
+ Cow::Owned(format!("field_{}", key_lower)),
249
251
  serde_json::Value::String(value.to_string()),
250
252
  );
251
253
  }
@@ -447,7 +449,7 @@ impl DocumentExtractor for RstExtractor {
447
449
 
448
450
  Ok(ExtractionResult {
449
451
  content: extracted_text,
450
- mime_type: mime_type.to_string(),
452
+ mime_type: mime_type.to_string().into(),
451
453
  metadata,
452
454
  tables,
453
455
  detected_languages: None,
@@ -504,7 +506,7 @@ This is a paragraph.
504
506
  Another paragraph.
505
507
  "#;
506
508
 
507
- let mut metadata = HashMap::new();
509
+ let mut metadata = AHashMap::new();
508
510
  let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
509
511
  assert!(output.contains("Title"));
510
512
  assert!(output.contains("This is a paragraph"));
@@ -522,7 +524,7 @@ Another paragraph.
522
524
  Some text after.
523
525
  "#;
524
526
 
525
- let mut metadata = HashMap::new();
527
+ let mut metadata = AHashMap::new();
526
528
  let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
527
529
  assert!(output.contains("code-block"));
528
530
  assert!(output.contains("def hello"));
@@ -540,7 +542,7 @@ First paragraph.
540
542
  Second paragraph.
541
543
  "#;
542
544
 
543
- let mut metadata = HashMap::new();
545
+ let mut metadata = AHashMap::new();
544
546
  let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
545
547
  assert!(output.contains("First paragraph"));
546
548
  assert!(output.contains("Second paragraph"));
@@ -1,8 +1,9 @@
1
1
  //! Metadata extraction from RTF documents.
2
2
 
3
3
  use crate::extractors::rtf::encoding::parse_rtf_control_word;
4
+ use ahash::AHashMap;
4
5
  use serde_json::Value;
5
- use std::collections::HashMap;
6
+ use std::borrow::Cow;
6
7
 
7
8
  /// Parse a `{\\creatim ...}` or `{\\revtim ...}` RTF info block into ISO 8601 format.
8
9
  pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
@@ -45,8 +46,8 @@ pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
45
46
  }
46
47
 
47
48
  /// Extract metadata from the RTF `\\info` block and augment with computed statistics.
48
- pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<String, Value> {
49
- let mut metadata: HashMap<String, Value> = HashMap::new();
49
+ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> AHashMap<Cow<'static, str>, Value> {
50
+ let mut metadata: AHashMap<Cow<'static, str>, Value> = AHashMap::new();
50
51
 
51
52
  if let Some(start) = rtf_content.find("{\\info") {
52
53
  let slice = &rtf_content[start..];
@@ -120,68 +121,68 @@ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<
120
121
  "author" => {
121
122
  if !trimmed.is_empty() {
122
123
  let author = trimmed.to_string();
123
- metadata.insert("created_by".to_string(), Value::String(author.clone()));
124
- metadata.insert("authors".to_string(), Value::Array(vec![Value::String(author)]));
124
+ metadata.insert(Cow::Borrowed("created_by"), Value::String(author.clone()));
125
+ metadata.insert(Cow::Borrowed("authors"), Value::Array(vec![Value::String(author)]));
125
126
  }
126
127
  }
127
128
  "operator" => {
128
129
  if !trimmed.is_empty() {
129
- metadata.insert("modified_by".to_string(), Value::String(trimmed.to_string()));
130
+ metadata.insert(Cow::Borrowed("modified_by"), Value::String(trimmed.to_string()));
130
131
  }
131
132
  }
132
133
  "title" => {
133
134
  if !trimmed.is_empty() {
134
- metadata.insert("title".to_string(), Value::String(trimmed.to_string()));
135
+ metadata.insert(Cow::Borrowed("title"), Value::String(trimmed.to_string()));
135
136
  }
136
137
  }
137
138
  "subject" => {
138
139
  if !trimmed.is_empty() {
139
- metadata.insert("subject".to_string(), Value::String(trimmed.to_string()));
140
+ metadata.insert(Cow::Borrowed("subject"), Value::String(trimmed.to_string()));
140
141
  }
141
142
  }
142
143
  "generator" => {
143
144
  if !trimmed.is_empty() {
144
- metadata.insert("generator".to_string(), Value::String(trimmed.to_string()));
145
+ metadata.insert(Cow::Borrowed("generator"), Value::String(trimmed.to_string()));
145
146
  }
146
147
  }
147
148
  "creatim" => {
148
149
  if let Some(dt) = parse_rtf_datetime(trimmed) {
149
- metadata.insert("created_at".to_string(), Value::String(dt));
150
+ metadata.insert(Cow::Borrowed("created_at"), Value::String(dt));
150
151
  }
151
152
  }
152
153
  "revtim" => {
153
154
  if let Some(dt) = parse_rtf_datetime(trimmed) {
154
- metadata.insert("modified_at".to_string(), Value::String(dt));
155
+ metadata.insert(Cow::Borrowed("modified_at"), Value::String(dt));
155
156
  }
156
157
  }
157
158
  "version" => {
158
159
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
159
- metadata.insert("revision".to_string(), Value::String(val.to_string()));
160
+ metadata.insert(Cow::Borrowed("revision"), Value::String(val.to_string()));
160
161
  }
161
162
  }
162
163
  "nofpages" => {
163
164
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
164
- metadata.insert("page_count".to_string(), Value::Number(val.into()));
165
+ metadata.insert(Cow::Borrowed("page_count"), Value::Number(val.into()));
165
166
  }
166
167
  }
167
168
  "nofwords" => {
168
169
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
169
- metadata.insert("word_count".to_string(), Value::Number(val.into()));
170
+ metadata.insert(Cow::Borrowed("word_count"), Value::Number(val.into()));
170
171
  }
171
172
  }
172
173
  "nofchars" => {
173
174
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
174
- metadata.insert("character_count".to_string(), Value::Number(val.into()));
175
+ metadata.insert(Cow::Borrowed("character_count"), Value::Number(val.into()));
175
176
  }
176
177
  }
177
178
  "lines" => {
178
179
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
179
- metadata.insert("line_count".to_string(), Value::Number(val.into()));
180
+ metadata.insert(Cow::Borrowed("line_count"), Value::Number(val.into()));
180
181
  }
181
182
  }
182
183
  "paragraphs" => {
183
184
  if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
184
- metadata.insert("paragraph_count".to_string(), Value::Number(val.into()));
185
+ metadata.insert(Cow::Borrowed("paragraph_count"), Value::Number(val.into()));
185
186
  }
186
187
  }
187
188
  _ => {}
@@ -193,22 +194,22 @@ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<
193
194
  if !cleaned_text.is_empty() {
194
195
  let word_count = cleaned_text.split_whitespace().count() as i64;
195
196
  metadata
196
- .entry("word_count".to_string())
197
+ .entry(Cow::Borrowed("word_count"))
197
198
  .or_insert(Value::Number(word_count.into()));
198
199
 
199
200
  let character_count = cleaned_text.chars().count() as i64;
200
201
  metadata
201
- .entry("character_count".to_string())
202
+ .entry(Cow::Borrowed("character_count"))
202
203
  .or_insert(Value::Number(character_count.into()));
203
204
 
204
205
  let line_count = cleaned_text.lines().count() as i64;
205
206
  metadata
206
- .entry("line_count".to_string())
207
+ .entry(Cow::Borrowed("line_count"))
207
208
  .or_insert(Value::Number(line_count.into()));
208
209
 
209
210
  let paragraph_count = cleaned_text.split("\n\n").filter(|p| !p.trim().is_empty()).count() as i64;
210
211
  metadata
211
- .entry("paragraph_count".to_string())
212
+ .entry(Cow::Borrowed("paragraph_count"))
212
213
  .or_insert(Value::Number(paragraph_count.into()));
213
214
  }
214
215
 
@@ -95,7 +95,7 @@ impl DocumentExtractor for RtfExtractor {
95
95
 
96
96
  Ok(ExtractionResult {
97
97
  content: extracted_text,
98
- mime_type: mime_type.to_string(),
98
+ mime_type: mime_type.to_string().into(),
99
99
  metadata: Metadata {
100
100
  additional: metadata_map,
101
101
  ..Default::default()
@@ -4,7 +4,9 @@ use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExtractionResult, Metadata};
7
+ use ahash::AHashMap;
7
8
  use async_trait::async_trait;
9
+ use std::borrow::Cow;
8
10
  #[cfg(feature = "tokio-runtime")]
9
11
  use std::path::Path;
10
12
 
@@ -63,20 +65,23 @@ impl DocumentExtractor for StructuredExtractor {
63
65
  _ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
64
66
  };
65
67
 
66
- let mut additional = std::collections::HashMap::new();
68
+ let mut additional = AHashMap::new();
67
69
  additional.insert(
68
- "field_count".to_string(),
70
+ Cow::Borrowed("field_count"),
69
71
  serde_json::json!(structured_result.text_fields.len()),
70
72
  );
71
- additional.insert("data_format".to_string(), serde_json::json!(structured_result.format));
73
+ additional.insert(
74
+ Cow::Borrowed("data_format"),
75
+ serde_json::json!(structured_result.format),
76
+ );
72
77
 
73
78
  for (key, value) in structured_result.metadata {
74
- additional.insert(key, serde_json::json!(value));
79
+ additional.insert(Cow::Owned(key), serde_json::json!(value));
75
80
  }
76
81
 
77
82
  Ok(ExtractionResult {
78
83
  content: structured_result.content,
79
- mime_type: mime_type.to_string(),
84
+ mime_type: mime_type.to_string().into(),
80
85
  metadata: Metadata {
81
86
  additional,
82
87
  ..Default::default()
@@ -74,7 +74,7 @@ impl DocumentExtractor for PlainTextExtractor {
74
74
 
75
75
  Ok(ExtractionResult {
76
76
  content: text,
77
- mime_type: mime_type.to_string(),
77
+ mime_type: mime_type.to_string().into(),
78
78
  metadata: crate::types::Metadata {
79
79
  format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
80
80
  line_count,
@@ -169,7 +169,7 @@ impl DocumentExtractor for MarkdownExtractor {
169
169
 
170
170
  Ok(ExtractionResult {
171
171
  content: text_result.content,
172
- mime_type: mime_type.to_string(),
172
+ mime_type: mime_type.to_string().into(),
173
173
  metadata: crate::types::Metadata {
174
174
  format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
175
175
  line_count: text_result.line_count,
@@ -29,6 +29,8 @@ use crate::types::{ExtractionResult, Metadata};
29
29
  use async_trait::async_trait;
30
30
  #[cfg(feature = "office")]
31
31
  use regex::Regex;
32
+ #[cfg(feature = "office")]
33
+ use std::borrow::Cow;
32
34
 
33
35
  /// Typst document extractor
34
36
  #[cfg(feature = "office")]
@@ -106,7 +108,7 @@ impl DocumentExtractor for TypstExtractor {
106
108
 
107
109
  Ok(ExtractionResult {
108
110
  content: text,
109
- mime_type: mime_type.to_string(),
111
+ mime_type: mime_type.to_string().into(),
110
112
  metadata,
111
113
  tables: Vec::new(),
112
114
  detected_languages: None,
@@ -151,11 +153,11 @@ impl TypstParser {
151
153
 
152
154
  fn extract_metadata(&mut self) {
153
155
  if let Some(title) = self.extract_quoted_value("title") {
154
- self.metadata.additional.insert("title".to_string(), title.into());
156
+ self.metadata.additional.insert(Cow::Borrowed("title"), title.into());
155
157
  }
156
158
 
157
159
  if let Some(author) = self.extract_quoted_value("author") {
158
- self.metadata.additional.insert("author".to_string(), author.into());
160
+ self.metadata.additional.insert(Cow::Borrowed("author"), author.into());
159
161
  }
160
162
 
161
163
  if let Some(date) = self.extract_quoted_value("date") {
@@ -163,11 +165,15 @@ impl TypstParser {
163
165
  }
164
166
 
165
167
  if let Some(subject) = self.extract_quoted_value("subject") {
166
- self.metadata.additional.insert("subject".to_string(), subject.into());
168
+ self.metadata
169
+ .additional
170
+ .insert(Cow::Borrowed("subject"), subject.into());
167
171
  }
168
172
 
169
173
  if let Some(keywords) = self.extract_keywords() {
170
- self.metadata.additional.insert("keywords".to_string(), keywords.into());
174
+ self.metadata
175
+ .additional
176
+ .insert(Cow::Borrowed("keywords"), keywords.into());
171
177
  }
172
178
  }
173
179
 
@@ -58,7 +58,7 @@ impl SyncExtractor for XmlExtractor {
58
58
 
59
59
  Ok(ExtractionResult {
60
60
  content: xml_result.content,
61
- mime_type: mime_type.to_string(),
61
+ mime_type: mime_type.to_string().into(),
62
62
  metadata: crate::types::Metadata {
63
63
  format: Some(crate::types::FormatMetadata::Xml(crate::types::XmlMetadata {
64
64
  element_count: xml_result.element_count,