kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -21,6 +21,8 @@ use crate::types::{Element, ExtractionResult};
21
21
  use content::{
22
22
  add_page_break, format_table_as_text, process_content, process_hierarchy, process_images, process_tables,
23
23
  };
24
+ #[cfg(test)]
25
+ use std::borrow::Cow;
24
26
 
25
27
  /// Transform an extraction result into semantic elements.
26
28
  ///
@@ -117,7 +119,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
117
119
  element_index: Some(elements.len()),
118
120
  additional: {
119
121
  let mut m = std::collections::HashMap::new();
120
- m.insert("format".to_string(), image.format.clone());
122
+ m.insert("format".to_string(), image.format.to_string());
121
123
  if let Some(width) = image.width {
122
124
  m.insert("width".to_string(), width.to_string());
123
125
  }
@@ -138,6 +140,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
138
140
  #[cfg(test)]
139
141
  mod tests {
140
142
  use super::*;
143
+ use bytes::Bytes;
141
144
 
142
145
  #[test]
143
146
  fn test_detect_bullet_items() {
@@ -262,7 +265,7 @@ mod tests {
262
265
  // Create a mock result with pages and hierarchy
263
266
  let result = ExtractionResult {
264
267
  content: "Full document content".to_string(),
265
- mime_type: "application/pdf".to_string(),
268
+ mime_type: Cow::Borrowed("application/pdf"),
266
269
  metadata: test_metadata(Some("Test Document".to_string())),
267
270
  tables: vec![],
268
271
  detected_languages: None,
@@ -358,8 +361,8 @@ mod tests {
358
361
  };
359
362
 
360
363
  let image = ExtractedImage {
361
- data: vec![1, 2, 3, 4],
362
- format: "jpeg".to_string(),
364
+ data: Bytes::from_static(&[1, 2, 3, 4]),
365
+ format: std::borrow::Cow::Borrowed("jpeg"),
363
366
  image_index: 0,
364
367
  page_number: Some(1),
365
368
  width: Some(640),
@@ -373,7 +376,7 @@ mod tests {
373
376
 
374
377
  let result = ExtractionResult {
375
378
  content: "Test content".to_string(),
376
- mime_type: "application/pdf".to_string(),
379
+ mime_type: Cow::Borrowed("application/pdf"),
377
380
  metadata: test_metadata(Some("Test".to_string())),
378
381
  tables: vec![],
379
382
  detected_languages: None,
@@ -421,7 +424,7 @@ mod tests {
421
424
  // Create a result without pages
422
425
  let result = ExtractionResult {
423
426
  content: "Simple text content\n\nSecond paragraph".to_string(),
424
- mime_type: "text/plain".to_string(),
427
+ mime_type: Cow::Borrowed("text/plain"),
425
428
  metadata: test_metadata(Some("Simple Doc".to_string())),
426
429
  tables: vec![],
427
430
  detected_languages: None,
@@ -453,7 +456,7 @@ mod tests {
453
456
 
454
457
  let result = ExtractionResult {
455
458
  content: "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.".to_string(),
456
- mime_type: "text/plain".to_string(),
459
+ mime_type: Cow::Borrowed("text/plain"),
457
460
  metadata: test_metadata(None),
458
461
  tables: vec![],
459
462
  detected_languages: None,
@@ -8,7 +8,9 @@ use crate::extraction::archive::{
8
8
  };
9
9
  use crate::plugins::{DocumentExtractor, Plugin};
10
10
  use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
11
+ use ahash::AHashMap;
11
12
  use async_trait::async_trait;
13
+ use std::borrow::Cow;
12
14
  use std::collections::HashMap;
13
15
 
14
16
  /// Build an ExtractionResult from archive metadata and text contents.
@@ -18,7 +20,7 @@ use std::collections::HashMap;
18
20
  fn build_archive_result(
19
21
  extraction_metadata: ExtractedMetadata,
20
22
  text_contents: HashMap<String, String>,
21
- format_name: &str,
23
+ format_name: &'static str,
22
24
  mime_type: &str,
23
25
  ) -> ExtractionResult {
24
26
  let file_names: Vec<String> = extraction_metadata
@@ -28,14 +30,14 @@ fn build_archive_result(
28
30
  .collect();
29
31
 
30
32
  let archive_metadata = ArchiveMetadata {
31
- format: format_name.to_string(),
33
+ format: Cow::Borrowed(format_name),
32
34
  file_count: extraction_metadata.file_count,
33
35
  file_list: file_names,
34
36
  total_size: extraction_metadata.total_size as usize,
35
37
  compressed_size: None,
36
38
  };
37
39
 
38
- let mut additional = HashMap::new();
40
+ let mut additional = AHashMap::new();
39
41
  let file_details: Vec<serde_json::Value> = extraction_metadata
40
42
  .file_list
41
43
  .iter()
@@ -47,7 +49,7 @@ fn build_archive_result(
47
49
  })
48
50
  })
49
51
  .collect();
50
- additional.insert("files".to_string(), serde_json::json!(file_details));
52
+ additional.insert(Cow::Borrowed("files"), serde_json::json!(file_details));
51
53
 
52
54
  let mut output = format!(
53
55
  "{} Archive ({} files, {} bytes)\n\n",
@@ -67,7 +69,7 @@ fn build_archive_result(
67
69
 
68
70
  ExtractionResult {
69
71
  content: output,
70
- mime_type: mime_type.to_string(),
72
+ mime_type: mime_type.to_string().into(),
71
73
  metadata: Metadata {
72
74
  format: Some(crate::types::FormatMetadata::Archive(archive_metadata)),
73
75
  additional,
@@ -7,8 +7,10 @@ use crate::Result;
7
7
  use crate::core::config::ExtractionConfig;
8
8
  use crate::plugins::{DocumentExtractor, Plugin};
9
9
  use crate::types::{ExtractionResult, Metadata};
10
+ use ahash::AHashMap;
10
11
  use async_trait::async_trait;
11
- use std::collections::{HashMap, HashSet};
12
+ use std::borrow::Cow;
13
+ use std::collections::HashSet;
12
14
 
13
15
  #[cfg(feature = "office")]
14
16
  use biblatex::{Bibliography, ChunksExt};
@@ -79,7 +81,7 @@ impl DocumentExtractor for BibtexExtractor {
79
81
  let mut entries_vec = Vec::new();
80
82
  let mut authors_set = HashSet::new();
81
83
  let mut years_set = HashSet::new();
82
- let mut entry_types_map = HashMap::new();
84
+ let mut entry_types_map: AHashMap<String, i32> = AHashMap::new();
83
85
  let mut formatted_entries = String::new();
84
86
 
85
87
  match Bibliography::parse(&bibtex_str) {
@@ -129,19 +131,19 @@ impl DocumentExtractor for BibtexExtractor {
129
131
  }
130
132
  }
131
133
 
132
- let mut additional = HashMap::new();
134
+ let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
133
135
 
134
- additional.insert("entry_count".to_string(), serde_json::json!(entries_vec.len()));
136
+ additional.insert(Cow::Borrowed("entry_count"), serde_json::json!(entries_vec.len()));
135
137
 
136
138
  let mut authors_list: Vec<String> = authors_set.into_iter().collect();
137
139
  authors_list.sort();
138
- additional.insert("authors".to_string(), serde_json::json!(authors_list));
140
+ additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));
139
141
 
140
142
  if !years_set.is_empty() {
141
143
  let min_year = years_set.iter().min().copied().unwrap_or(0);
142
144
  let max_year = years_set.iter().max().copied().unwrap_or(0);
143
145
  additional.insert(
144
- "year_range".to_string(),
146
+ Cow::Borrowed("year_range"),
145
147
  serde_json::json!({
146
148
  "min": min_year,
147
149
  "max": max_year,
@@ -155,14 +157,14 @@ impl DocumentExtractor for BibtexExtractor {
155
157
  for (entry_type, count) in entry_types_map {
156
158
  entry_types_json[entry_type] = serde_json::json!(count);
157
159
  }
158
- additional.insert("entry_types".to_string(), entry_types_json);
160
+ additional.insert(Cow::Borrowed("entry_types"), entry_types_json);
159
161
  }
160
162
 
161
- additional.insert("citation_keys".to_string(), serde_json::json!(entries_vec));
163
+ additional.insert(Cow::Borrowed("citation_keys"), serde_json::json!(entries_vec));
162
164
 
163
165
  Ok(ExtractionResult {
164
166
  content: formatted_entries,
165
- mime_type: mime_type.to_string(),
167
+ mime_type: mime_type.to_string().into(),
166
168
  metadata: Metadata {
167
169
  additional,
168
170
  ..Default::default()
@@ -222,7 +224,10 @@ mod tests {
222
224
  assert!(result.content.contains("Sample Title"));
223
225
 
224
226
  let metadata = &result.metadata;
225
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
227
+ assert_eq!(
228
+ metadata.additional.get(&Cow::Borrowed("entry_count")),
229
+ Some(&serde_json::json!(1))
230
+ );
226
231
  }
227
232
 
228
233
  #[tokio::test]
@@ -258,15 +263,18 @@ mod tests {
258
263
 
259
264
  let metadata = &result.metadata;
260
265
 
261
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
266
+ assert_eq!(
267
+ metadata.additional.get(&Cow::Borrowed("entry_count")),
268
+ Some(&serde_json::json!(3))
269
+ );
262
270
 
263
- if let Some(keys) = metadata.additional.get("citation_keys")
271
+ if let Some(keys) = metadata.additional.get(&Cow::Borrowed("citation_keys"))
264
272
  && let Some(keys_array) = keys.as_array()
265
273
  {
266
274
  assert_eq!(keys_array.len(), 3);
267
275
  }
268
276
 
269
- if let Some(types) = metadata.additional.get("entry_types") {
277
+ if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
270
278
  assert!(types.get("article").is_some());
271
279
  assert!(types.get("book").is_some());
272
280
  assert!(types.get("inproceedings").is_some());
@@ -330,7 +338,10 @@ mod tests {
330
338
  assert!(result.content.contains("The TeXbook"));
331
339
 
332
340
  let metadata = &result.metadata;
333
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
341
+ assert_eq!(
342
+ metadata.additional.get(&Cow::Borrowed("entry_count")),
343
+ Some(&serde_json::json!(1))
344
+ );
334
345
 
335
346
  if let Some(year_range) = metadata.additional.get("year_range") {
336
347
  assert_eq!(year_range.get("min"), Some(&serde_json::json!(1984)));
@@ -368,7 +379,10 @@ mod tests {
368
379
  let result = result.expect("Should extract valid metadata");
369
380
  let metadata = &result.metadata;
370
381
 
371
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
382
+ assert_eq!(
383
+ metadata.additional.get(&Cow::Borrowed("entry_count")),
384
+ Some(&serde_json::json!(3))
385
+ );
372
386
 
373
387
  if let Some(authors) = metadata.additional.get("authors")
374
388
  && let Some(authors_array) = authors.as_array()
@@ -381,7 +395,7 @@ mod tests {
381
395
  assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
382
396
  }
383
397
 
384
- if let Some(types) = metadata.additional.get("entry_types") {
398
+ if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
385
399
  assert_eq!(types.get("article"), Some(&serde_json::json!(2)));
386
400
  assert_eq!(types.get("book"), Some(&serde_json::json!(1)));
387
401
  }
@@ -401,7 +415,10 @@ mod tests {
401
415
  let result = result.expect("Should extract empty bibliography");
402
416
  let metadata = &result.metadata;
403
417
 
404
- assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(0)));
418
+ assert_eq!(
419
+ metadata.additional.get(&Cow::Borrowed("entry_count")),
420
+ Some(&serde_json::json!(0))
421
+ );
405
422
  }
406
423
 
407
424
  #[tokio::test]
@@ -2,8 +2,6 @@
2
2
  //!
3
3
  //! Handles parsing of Djot attributes from jotdown events and string syntax.
4
4
 
5
- use std::collections::HashMap;
6
-
7
5
  /// Parse jotdown attributes into our Attributes representation.
8
6
  ///
9
7
  /// Converts jotdown's internal attribute representation to Kreuzberg's
@@ -14,7 +12,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
14
12
 
15
13
  let mut id = None;
16
14
  let mut classes = Vec::new();
17
- let mut key_values = HashMap::new();
15
+ let mut key_values = Vec::new();
18
16
 
19
17
  for (kind, value) in attrs.iter() {
20
18
  match kind {
@@ -26,7 +24,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
26
24
  classes.push(value.to_string());
27
25
  }
28
26
  AttributeKind::Pair { key } => {
29
- key_values.insert(key.to_string(), value.to_string());
27
+ key_values.push((key.to_string(), value.to_string()));
30
28
  }
31
29
  AttributeKind::Comment => {
32
30
  // Comments are ignored in our representation
@@ -49,7 +47,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
49
47
  let mut attrs = Attributes {
50
48
  id: None,
51
49
  classes: Vec::new(),
52
- key_values: HashMap::new(),
50
+ key_values: Vec::new(),
53
51
  };
54
52
 
55
53
  // Simple parser for attribute syntax
@@ -66,7 +64,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
66
64
  // Key-value pair
67
65
  if let Some((key, value)) = token.split_once('=') {
68
66
  let clean_value = value.trim_matches('"').trim_matches('\'');
69
- attrs.key_values.insert(key.to_string(), clean_value.to_string());
67
+ attrs.key_values.push((key.to_string(), clean_value.to_string()));
70
68
  }
71
69
  }
72
70
  }
@@ -106,12 +104,11 @@ mod tests {
106
104
 
107
105
  #[test]
108
106
  fn test_render_attributes_with_all_parts() {
109
- let mut attrs = crate::types::Attributes {
107
+ let attrs = crate::types::Attributes {
110
108
  id: Some("my-id".to_string()),
111
109
  classes: vec!["class1".to_string(), "class2".to_string()],
112
- key_values: HashMap::new(),
110
+ key_values: vec![("data-test".to_string(), "value".to_string())],
113
111
  };
114
- attrs.key_values.insert("data-test".to_string(), "value".to_string());
115
112
 
116
113
  let rendered = render_attributes(&attrs);
117
114
  assert!(rendered.contains("#my-id"));
@@ -125,7 +122,7 @@ mod tests {
125
122
  let attrs = crate::types::Attributes {
126
123
  id: None,
127
124
  classes: vec![],
128
- key_values: HashMap::new(),
125
+ key_values: Vec::new(),
129
126
  };
130
127
 
131
128
  let rendered = render_attributes(&attrs);
@@ -7,6 +7,8 @@
7
7
 
8
8
  use super::rendering::render_block_to_djot;
9
9
  use jotdown::Parser;
10
+ #[cfg(test)]
11
+ use std::borrow::Cow;
10
12
 
11
13
  /// Convert DjotContent back to djot markup.
12
14
  ///
@@ -150,7 +152,7 @@ mod tests {
150
152
  fn test_extraction_result_to_djot_with_djot_content() {
151
153
  let result = ExtractionResult {
152
154
  content: "Test content".to_string(),
153
- mime_type: "text/djot".to_string(),
155
+ mime_type: Cow::Borrowed("text/djot"),
154
156
  metadata: Metadata::default(),
155
157
  tables: vec![],
156
158
  detected_languages: None,
@@ -191,7 +193,7 @@ mod tests {
191
193
  fn test_extraction_result_to_djot_without_djot_content() {
192
194
  let result = ExtractionResult {
193
195
  content: "Paragraph one\n\nParagraph two".to_string(),
194
- mime_type: "text/plain".to_string(),
196
+ mime_type: Cow::Borrowed("text/plain"),
195
197
  metadata: Metadata::default(),
196
198
  tables: vec![],
197
199
  detected_languages: None,
@@ -9,6 +9,7 @@ use crate::plugins::{DocumentExtractor, Plugin};
9
9
  use crate::types::{ExtractionResult, Metadata};
10
10
  use async_trait::async_trait;
11
11
  use jotdown::{Event, Parser};
12
+ use std::borrow::Cow;
12
13
 
13
14
  /// Djot markup extractor with metadata and table support.
14
15
  ///
@@ -90,7 +91,7 @@ impl DocumentExtractor for DjotExtractor {
90
91
  if !metadata.additional.contains_key("title")
91
92
  && let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
92
93
  {
93
- metadata.additional.insert("title".to_string(), title.into());
94
+ metadata.additional.insert(Cow::Borrowed("title"), title.into());
94
95
  }
95
96
 
96
97
  // Parse with jotdown and collect events once for extraction
@@ -105,7 +106,7 @@ impl DocumentExtractor for DjotExtractor {
105
106
 
106
107
  Ok(ExtractionResult {
107
108
  content: extracted_text,
108
- mime_type: mime_type.to_string(),
109
+ mime_type: mime_type.to_string().into(),
109
110
  metadata,
110
111
  tables,
111
112
  detected_languages: None,
@@ -135,7 +135,7 @@ pub(super) fn handle_block_start(
135
135
  }
136
136
  Container::TaskListItem { checked } => {
137
137
  let mut attrs = parsed_attrs.unwrap_or_default();
138
- attrs.key_values.insert("checked".to_string(), checked.to_string());
138
+ attrs.key_values.push(("checked".to_string(), checked.to_string()));
139
139
  push_block(
140
140
  state,
141
141
  FormattedBlock {
@@ -14,8 +14,6 @@ use super::text_extraction::extract_text_from_events;
14
14
  use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
15
15
  use crate::types::{Attributes, DjotContent, DjotImage, DjotLink, FormattedBlock};
16
16
  use jotdown::{Container, Event};
17
- use std::collections::HashMap;
18
-
19
17
  /// Extract complete djot content with 100% feature extraction.
20
18
  ///
21
19
  /// Processes ALL djot events to build a rich DjotContent structure including:
@@ -42,7 +40,7 @@ pub fn extract_complete_djot_content(
42
40
  let mut images = Vec::new();
43
41
  let mut links = Vec::new();
44
42
  let mut footnotes = Vec::new();
45
- let attributes_map: HashMap<String, Attributes> = HashMap::new();
43
+ let attributes_map: Vec<(String, Attributes)> = Vec::new();
46
44
 
47
45
  let mut state = ExtractionState::new();
48
46
 
@@ -186,7 +184,7 @@ fn handle_start_event(
186
184
  };
187
185
 
188
186
  // Try block handlers first
189
- if handle_block_start(state, container, attrs, parsed_attrs.clone(), footnotes) {
187
+ if handle_block_start(state, container, attrs, parsed_attrs.as_ref().cloned(), footnotes) {
190
188
  return;
191
189
  }
192
190
 
@@ -9,7 +9,7 @@ use std::collections::HashMap;
9
9
  pub(super) fn handle_footnote_reference(state: &mut ExtractionState, label: &str) {
10
10
  state.flush_text();
11
11
 
12
- let mut meta = HashMap::new();
12
+ let mut meta: HashMap<String, String> = HashMap::new();
13
13
  meta.insert("label".to_string(), label.to_string());
14
14
 
15
15
  state.current_inline_elements.push(InlineElement {
@@ -3,7 +3,6 @@
3
3
  use super::state::ExtractionState;
4
4
  use crate::types::{DjotImage, DjotLink, InlineElement, InlineType};
5
5
  use jotdown::Container;
6
- use std::collections::HashMap;
7
6
 
8
7
  /// Handle start of inline elements.
9
8
  pub(super) fn handle_inline_start(
@@ -123,7 +122,7 @@ pub(super) fn handle_math_end(state: &mut ExtractionState, display: bool) {
123
122
  let math_text = std::mem::take(&mut state.math_content);
124
123
  state.inline_type_stack.pop();
125
124
 
126
- let mut meta = HashMap::new();
125
+ let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
127
126
  meta.insert("display".to_string(), display.to_string());
128
127
 
129
128
  state.current_inline_elements.push(InlineElement {
@@ -144,7 +143,7 @@ pub(super) fn finalize_inline_element(state: &mut ExtractionState, container: &C
144
143
  if matches!(container, Container::RawInline { .. })
145
144
  && let Some(fmt) = state.raw_format.take()
146
145
  {
147
- let mut m = HashMap::new();
146
+ let mut m: std::collections::HashMap<String, String> = std::collections::HashMap::new();
148
147
  m.insert("format".to_string(), fmt);
149
148
  meta = Some(m);
150
149
  }
@@ -167,7 +166,7 @@ pub(super) fn handle_link_end(state: &mut ExtractionState, url: &str, links: &mu
167
166
  }
168
167
  state.inline_type_stack.pop();
169
168
 
170
- let mut meta = HashMap::new();
169
+ let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
171
170
  meta.insert("href".to_string(), url.to_string());
172
171
 
173
172
  state.current_inline_elements.push(InlineElement {
@@ -188,7 +187,7 @@ pub(super) fn handle_image_end(state: &mut ExtractionState, src: &str, images: &
188
187
  }
189
188
  state.inline_type_stack.pop();
190
189
 
191
- let mut meta = HashMap::new();
190
+ let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
192
191
  meta.insert("src".to_string(), src.to_string());
193
192
 
194
193
  state.current_inline_elements.push(InlineElement {
@@ -43,7 +43,7 @@ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
43
43
  if !current_row.is_empty()
44
44
  && let Some((ref mut rows, _)) = current_table
45
45
  {
46
- rows.push(current_row.clone());
46
+ rows.push(std::mem::take(&mut current_row));
47
47
  }
48
48
  current_row = Vec::new();
49
49
  }
@@ -398,7 +398,7 @@ impl DocumentExtractor for DocbookExtractor {
398
398
 
399
399
  Ok(ExtractionResult {
400
400
  content: extracted_content,
401
- mime_type: mime_type.to_string(),
401
+ mime_type: mime_type.to_string().into(),
402
402
  metadata,
403
403
  tables,
404
404
  detected_languages: None,
@@ -9,7 +9,9 @@ use crate::core::config::ExtractionConfig;
9
9
  use crate::extraction::{cells_to_markdown, office_metadata};
10
10
  use crate::plugins::{DocumentExtractor, Plugin};
11
11
  use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
12
+ use ahash::AHashMap;
12
13
  use async_trait::async_trait;
14
+ use std::borrow::Cow;
13
15
  use std::io::Cursor;
14
16
 
15
17
  /// High-performance DOCX extractor using docx-lite.
@@ -181,22 +183,22 @@ impl DocumentExtractor for DocxExtractor {
181
183
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
182
184
  };
183
185
 
184
- let mut metadata_map = std::collections::HashMap::new();
186
+ let mut metadata_map = AHashMap::new();
185
187
  let mut parsed_keywords: Option<Vec<String>> = None;
186
188
 
187
189
  if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
188
190
  if let Some(title) = core.title {
189
- metadata_map.insert("title".to_string(), serde_json::Value::String(title));
191
+ metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
190
192
  }
191
193
  if let Some(creator) = core.creator {
192
194
  metadata_map.insert(
193
- "authors".to_string(),
195
+ Cow::Borrowed("authors"),
194
196
  serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
195
197
  );
196
- metadata_map.insert("created_by".to_string(), serde_json::Value::String(creator));
198
+ metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
197
199
  }
198
200
  if let Some(subject) = core.subject {
199
- metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
201
+ metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
200
202
  }
201
203
  if let Some(keywords) = core.keywords {
202
204
  // Parse comma-separated keywords into Vec<String>
@@ -209,70 +211,76 @@ impl DocumentExtractor for DocxExtractor {
209
211
  );
210
212
  }
211
213
  if let Some(description) = core.description {
212
- metadata_map.insert("description".to_string(), serde_json::Value::String(description));
214
+ metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
213
215
  }
214
216
  if let Some(modified_by) = core.last_modified_by {
215
- metadata_map.insert("modified_by".to_string(), serde_json::Value::String(modified_by));
217
+ metadata_map.insert(Cow::Borrowed("modified_by"), serde_json::Value::String(modified_by));
216
218
  }
217
219
  if let Some(created) = core.created {
218
- metadata_map.insert("created_at".to_string(), serde_json::Value::String(created));
220
+ metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(created));
219
221
  }
220
222
  if let Some(modified) = core.modified {
221
- metadata_map.insert("modified_at".to_string(), serde_json::Value::String(modified));
223
+ metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(modified));
222
224
  }
223
225
  if let Some(revision) = core.revision {
224
- metadata_map.insert("revision".to_string(), serde_json::Value::String(revision));
226
+ metadata_map.insert(Cow::Borrowed("revision"), serde_json::Value::String(revision));
225
227
  }
226
228
  if let Some(category) = core.category {
227
- metadata_map.insert("category".to_string(), serde_json::Value::String(category));
229
+ metadata_map.insert(Cow::Borrowed("category"), serde_json::Value::String(category));
228
230
  }
229
231
  if let Some(content_status) = core.content_status {
230
- metadata_map.insert("content_status".to_string(), serde_json::Value::String(content_status));
232
+ metadata_map.insert(
233
+ Cow::Borrowed("content_status"),
234
+ serde_json::Value::String(content_status),
235
+ );
231
236
  }
232
237
  if let Some(language) = core.language {
233
- metadata_map.insert("language".to_string(), serde_json::Value::String(language));
238
+ metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
234
239
  }
235
240
  }
236
241
 
237
242
  if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
238
243
  if let Some(pages) = app.pages {
239
- metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
244
+ metadata_map.insert(Cow::Borrowed("page_count"), serde_json::Value::Number(pages.into()));
240
245
  }
241
246
  if let Some(words) = app.words {
242
- metadata_map.insert("word_count".to_string(), serde_json::Value::Number(words.into()));
247
+ metadata_map.insert(Cow::Borrowed("word_count"), serde_json::Value::Number(words.into()));
243
248
  }
244
249
  if let Some(chars) = app.characters {
245
- metadata_map.insert("character_count".to_string(), serde_json::Value::Number(chars.into()));
250
+ metadata_map.insert(
251
+ Cow::Borrowed("character_count"),
252
+ serde_json::Value::Number(chars.into()),
253
+ );
246
254
  }
247
255
  if let Some(lines) = app.lines {
248
- metadata_map.insert("line_count".to_string(), serde_json::Value::Number(lines.into()));
256
+ metadata_map.insert(Cow::Borrowed("line_count"), serde_json::Value::Number(lines.into()));
249
257
  }
250
258
  if let Some(paragraphs) = app.paragraphs {
251
259
  metadata_map.insert(
252
- "paragraph_count".to_string(),
260
+ Cow::Borrowed("paragraph_count"),
253
261
  serde_json::Value::Number(paragraphs.into()),
254
262
  );
255
263
  }
256
264
  if let Some(template) = app.template {
257
- metadata_map.insert("template".to_string(), serde_json::Value::String(template));
265
+ metadata_map.insert(Cow::Borrowed("template"), serde_json::Value::String(template));
258
266
  }
259
267
  if let Some(company) = app.company {
260
- metadata_map.insert("organization".to_string(), serde_json::Value::String(company));
268
+ metadata_map.insert(Cow::Borrowed("organization"), serde_json::Value::String(company));
261
269
  }
262
270
  if let Some(time) = app.total_time {
263
271
  metadata_map.insert(
264
- "total_editing_time_minutes".to_string(),
272
+ Cow::Borrowed("total_editing_time_minutes"),
265
273
  serde_json::Value::Number(time.into()),
266
274
  );
267
275
  }
268
276
  if let Some(application) = app.application {
269
- metadata_map.insert("application".to_string(), serde_json::Value::String(application));
277
+ metadata_map.insert(Cow::Borrowed("application"), serde_json::Value::String(application));
270
278
  }
271
279
  }
272
280
 
273
281
  if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
274
282
  for (key, value) in custom {
275
- metadata_map.insert(format!("custom_{}", key), value);
283
+ metadata_map.insert(Cow::Owned(format!("custom_{}", key)), value);
276
284
  }
277
285
  }
278
286
 
@@ -301,7 +309,7 @@ impl DocumentExtractor for DocxExtractor {
301
309
 
302
310
  Ok(ExtractionResult {
303
311
  content: text,
304
- mime_type: mime_type.to_string(),
312
+ mime_type: mime_type.to_string().into(),
305
313
  metadata: Metadata {
306
314
  pages: page_structure,
307
315
  keywords: parsed_keywords,