kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -147,10 +147,9 @@ async fn test_chunking_max_chars_limits_chunk_size() {
147
147
 
148
148
  let config = ExtractionConfig {
149
149
  chunking: Some(ChunkingConfig {
150
- max_chars: 100,
151
- max_overlap: 20,
152
- embedding: None,
153
- preset: None,
150
+ max_characters: 100,
151
+ overlap: 20,
152
+ ..Default::default()
154
153
  }),
155
154
  ..Default::default()
156
155
  };
@@ -184,10 +183,9 @@ async fn test_chunking_overlap_creates_overlap() {
184
183
 
185
184
  let config = ExtractionConfig {
186
185
  chunking: Some(ChunkingConfig {
187
- max_chars: 50,
188
- max_overlap: 15,
189
- embedding: None,
190
- preset: None,
186
+ max_characters: 50,
187
+ overlap: 15,
188
+ ..Default::default()
191
189
  }),
192
190
  ..Default::default()
193
191
  };
@@ -351,10 +349,9 @@ async fn test_chunking_overlap_maximum() {
351
349
 
352
350
  let config = ExtractionConfig {
353
351
  chunking: Some(ChunkingConfig {
354
- max_chars: 60,
355
- max_overlap: 10,
356
- embedding: None,
357
- preset: None,
352
+ max_characters: 60,
353
+ overlap: 10,
354
+ ..Default::default()
358
355
  }),
359
356
  ..Default::default()
360
357
  };
@@ -385,10 +382,9 @@ async fn test_large_document_with_combined_config() {
385
382
  let config = ExtractionConfig {
386
383
  output_format: OutputFormat::Plain,
387
384
  chunking: Some(ChunkingConfig {
388
- max_chars: 200,
389
- max_overlap: 30,
390
- embedding: None,
391
- preset: None,
385
+ max_characters: 200,
386
+ overlap: 30,
387
+ ..Default::default()
392
388
  }),
393
389
  use_cache: true,
394
390
  enable_quality_processing: true,
@@ -19,10 +19,12 @@ mod helpers;
19
19
  async fn test_chunking_enabled() {
20
20
  let config = ExtractionConfig {
21
21
  chunking: Some(ChunkingConfig {
22
- max_chars: 50,
23
- max_overlap: 10,
22
+ max_characters: 50,
23
+ overlap: 10,
24
24
  embedding: None,
25
25
  preset: None,
26
+ trim: true,
27
+ chunker_type: kreuzberg::chunking::ChunkerType::Text,
26
28
  }),
27
29
  ..Default::default()
28
30
  };
@@ -62,10 +64,12 @@ async fn test_chunking_enabled() {
62
64
  async fn test_chunking_with_overlap() {
63
65
  let config = ExtractionConfig {
64
66
  chunking: Some(ChunkingConfig {
65
- max_chars: 100,
66
- max_overlap: 20,
67
+ max_characters: 100,
68
+ overlap: 20,
67
69
  embedding: None,
68
70
  preset: None,
71
+ trim: true,
72
+ chunker_type: kreuzberg::chunking::ChunkerType::Text,
69
73
  }),
70
74
  ..Default::default()
71
75
  };
@@ -102,10 +106,12 @@ async fn test_chunking_with_overlap() {
102
106
  async fn test_chunking_custom_sizes() {
103
107
  let config = ExtractionConfig {
104
108
  chunking: Some(ChunkingConfig {
105
- max_chars: 200,
106
- max_overlap: 50,
109
+ max_characters: 200,
110
+ overlap: 50,
107
111
  embedding: None,
108
112
  preset: None,
113
+ trim: true,
114
+ chunker_type: kreuzberg::chunking::ChunkerType::Text,
109
115
  }),
110
116
  ..Default::default()
111
117
  };
@@ -512,10 +518,12 @@ async fn test_chunking_with_embeddings() {
512
518
 
513
519
  let config = ExtractionConfig {
514
520
  chunking: Some(ChunkingConfig {
515
- max_chars: 100,
516
- max_overlap: 20,
521
+ max_characters: 100,
522
+ overlap: 20,
517
523
  embedding: Some(EmbeddingConfig::default()),
518
524
  preset: None,
525
+ trim: true,
526
+ chunker_type: kreuzberg::chunking::ChunkerType::Text,
519
527
  }),
520
528
  ..Default::default()
521
529
  };
@@ -582,15 +590,15 @@ async fn test_chunking_with_fast_embeddings() {
582
590
 
583
591
  let config = ExtractionConfig {
584
592
  chunking: Some(ChunkingConfig {
585
- max_chars: 100,
586
- max_overlap: 20,
593
+ max_characters: 100,
594
+ overlap: 20,
587
595
  embedding: Some(EmbeddingConfig {
588
596
  model: EmbeddingModelType::Preset {
589
597
  name: "fast".to_string(),
590
598
  },
591
599
  ..Default::default()
592
600
  }),
593
- preset: None,
601
+ ..Default::default()
594
602
  }),
595
603
  ..Default::default()
596
604
  };
@@ -36,8 +36,8 @@ max_overlap = 100
36
36
  assert!(config.chunking.is_some(), "Should have chunking config");
37
37
 
38
38
  let chunking = config.chunking.expect("Operation failed");
39
- assert_eq!(chunking.max_chars, 1000);
40
- assert_eq!(chunking.max_overlap, 100);
39
+ assert_eq!(chunking.max_characters, 1000);
40
+ assert_eq!(chunking.overlap, 100);
41
41
  }
42
42
 
43
43
  /// Test loading config from YAML file.
@@ -51,8 +51,8 @@ ocr:
51
51
  enabled: true
52
52
  backend: tesseract
53
53
  chunking:
54
- max_chars: 1000
55
- max_overlap: 100
54
+ max_characters: 1000
55
+ overlap: 100
56
56
  "#;
57
57
 
58
58
  fs::write(&config_path, yaml_content).expect("Operation failed");
@@ -65,8 +65,8 @@ chunking:
65
65
  assert!(config.chunking.is_some(), "Should have chunking config");
66
66
 
67
67
  let chunking = config.chunking.expect("Operation failed");
68
- assert_eq!(chunking.max_chars, 1000);
69
- assert_eq!(chunking.max_overlap, 100);
68
+ assert_eq!(chunking.max_characters, 1000);
69
+ assert_eq!(chunking.overlap, 100);
70
70
  }
71
71
 
72
72
  /// Test loading config from JSON file.
@@ -98,8 +98,8 @@ fn test_from_file_json_succeeds() {
98
98
  assert!(config.chunking.is_some(), "Should have chunking config");
99
99
 
100
100
  let chunking = config.chunking.expect("Operation failed");
101
- assert_eq!(chunking.max_chars, 1000);
102
- assert_eq!(chunking.max_overlap, 100);
101
+ assert_eq!(chunking.max_characters, 1000);
102
+ assert_eq!(chunking.overlap, 100);
103
103
  }
104
104
 
105
105
  /// Test loading config from .yml extension.
@@ -420,6 +420,6 @@ max_overlap = -100
420
420
  if let Ok(config) = result
421
421
  && let Some(chunking) = config.chunking
422
422
  {
423
- assert!(chunking.max_chars > 0, "max_chars should be positive");
423
+ assert!(chunking.max_characters > 0, "max_characters should be positive");
424
424
  }
425
425
  }
@@ -96,8 +96,8 @@ fn test_mcp_chunking_config_nested_matches_rust_core() {
96
96
  assert!(config.chunking.is_some(), "Chunking config should be present");
97
97
 
98
98
  if let Some(chunking) = &config.chunking {
99
- assert_eq!(chunking.max_chars, 500, "max_chars should be 500");
100
- assert_eq!(chunking.max_overlap, 50, "max_overlap should be 50");
99
+ assert_eq!(chunking.max_characters, 500, "max_chars should be 500");
100
+ assert_eq!(chunking.overlap, 50, "max_overlap should be 50");
101
101
  }
102
102
 
103
103
  // Verify roundtrip
@@ -370,8 +370,8 @@ max_overlap = 300
370
370
  assert_eq!(ocr_config.language, "deu");
371
371
 
372
372
  let chunking_config = config.chunking.expect("Operation failed");
373
- assert_eq!(chunking_config.max_chars, 2000);
374
- assert_eq!(chunking_config.max_overlap, 300);
373
+ assert_eq!(chunking_config.max_characters, 2000);
374
+ assert_eq!(chunking_config.overlap, 300);
375
375
  }
376
376
 
377
377
  /// Test config discovery in parent directories.
@@ -481,10 +481,9 @@ async fn test_extraction_with_chunking_config() {
481
481
 
482
482
  let config = ExtractionConfig {
483
483
  chunking: Some(kreuzberg::ChunkingConfig {
484
- max_chars: 100,
485
- max_overlap: 20,
486
- embedding: None,
487
- preset: None,
484
+ max_characters: 100,
485
+ overlap: 20,
486
+ ..Default::default()
488
487
  }),
489
488
  ..Default::default()
490
489
  };
@@ -576,7 +576,7 @@ async fn test_opml_extraction_statistics() {
576
576
  println!(" Metadata fields: {}", result.metadata.additional.len());
577
577
 
578
578
  if !result.metadata.additional.is_empty() {
579
- let keys: Vec<String> = result.metadata.additional.keys().cloned().collect();
579
+ let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
580
580
  println!(" Keys: {}", keys.join(", "));
581
581
  }
582
582
 
@@ -778,7 +778,7 @@ async fn test_orgmode_extraction_statistics() {
778
778
  println!(" Metadata fields: {}", result.metadata.additional.len());
779
779
 
780
780
  if !result.metadata.additional.is_empty() {
781
- let keys: Vec<String> = result.metadata.additional.keys().cloned().collect();
781
+ let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
782
782
  println!(" Keys: {}", keys.join(", "));
783
783
  }
784
784
 
@@ -13,6 +13,7 @@ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
13
13
  use kreuzberg::types::{ExtractionResult, Metadata};
14
14
  use kreuzberg::{KreuzbergError, Result};
15
15
  use serial_test::serial;
16
+ use std::borrow::Cow;
16
17
  use std::sync::Arc;
17
18
 
18
19
  struct OrderTrackingProcessor {
@@ -74,7 +75,7 @@ impl PostProcessor for MetadataAddingProcessor {
74
75
  result
75
76
  .metadata
76
77
  .additional
77
- .insert(self.key.clone(), serde_json::json!(self.value));
78
+ .insert(Cow::Owned(self.key.clone()), serde_json::json!(self.value));
78
79
  Ok(())
79
80
  }
80
81
 
@@ -134,7 +135,7 @@ async fn test_pipeline_empty_no_processors() {
134
135
 
135
136
  let result = ExtractionResult {
136
137
  content: "original content".to_string(),
137
- mime_type: "text/plain".to_string(),
138
+ mime_type: Cow::Borrowed("text/plain"),
138
139
  metadata: Metadata::default(),
139
140
  tables: vec![],
140
141
  detected_languages: None,
@@ -181,7 +182,7 @@ async fn test_pipeline_single_processor_per_stage() {
181
182
 
182
183
  let result = ExtractionResult {
183
184
  content: "start".to_string(),
184
- mime_type: "text/plain".to_string(),
185
+ mime_type: Cow::Borrowed("text/plain"),
185
186
  metadata: Metadata::default(),
186
187
  tables: vec![],
187
188
  detected_languages: None,
@@ -228,7 +229,7 @@ async fn test_pipeline_multiple_processors_per_stage() {
228
229
 
229
230
  let result = ExtractionResult {
230
231
  content: "start".to_string(),
231
- mime_type: "text/plain".to_string(),
232
+ mime_type: Cow::Borrowed("text/plain"),
232
233
  metadata: Metadata::default(),
233
234
  tables: vec![],
234
235
  detected_languages: None,
@@ -266,7 +267,7 @@ async fn test_pipeline_all_stages_enabled() {
266
267
 
267
268
  let result = ExtractionResult {
268
269
  content: "start".to_string(),
269
- mime_type: "text/plain".to_string(),
270
+ mime_type: Cow::Borrowed("text/plain"),
270
271
  metadata: Metadata::default(),
271
272
  tables: vec![],
272
273
  detected_languages: None,
@@ -302,7 +303,7 @@ async fn test_pipeline_postprocessing_disabled() {
302
303
 
303
304
  let result = ExtractionResult {
304
305
  content: "start".to_string(),
305
- mime_type: "text/plain".to_string(),
306
+ mime_type: Cow::Borrowed("text/plain"),
306
307
  metadata: Metadata::default(),
307
308
  tables: vec![],
308
309
  detected_languages: None,
@@ -353,7 +354,7 @@ async fn test_pipeline_early_stage_runs_first() {
353
354
 
354
355
  let result = ExtractionResult {
355
356
  content: "start".to_string(),
356
- mime_type: "text/plain".to_string(),
357
+ mime_type: Cow::Borrowed("text/plain"),
357
358
  metadata: Metadata::default(),
358
359
  tables: vec![],
359
360
  detected_languages: None,
@@ -395,7 +396,7 @@ async fn test_pipeline_middle_stage_runs_second() {
395
396
 
396
397
  let result = ExtractionResult {
397
398
  content: "start".to_string(),
398
- mime_type: "text/plain".to_string(),
399
+ mime_type: Cow::Borrowed("text/plain"),
399
400
  metadata: Metadata::default(),
400
401
  tables: vec![],
401
402
  detected_languages: None,
@@ -433,7 +434,7 @@ async fn test_pipeline_late_stage_runs_last() {
433
434
 
434
435
  let result = ExtractionResult {
435
436
  content: "start".to_string(),
436
- mime_type: "text/plain".to_string(),
437
+ mime_type: Cow::Borrowed("text/plain"),
437
438
  metadata: Metadata::default(),
438
439
  tables: vec![],
439
440
  detected_languages: None,
@@ -471,7 +472,7 @@ async fn test_pipeline_within_stage_priority_order() {
471
472
 
472
473
  let result = ExtractionResult {
473
474
  content: "start".to_string(),
474
- mime_type: "text/plain".to_string(),
475
+ mime_type: Cow::Borrowed("text/plain"),
475
476
  metadata: Metadata::default(),
476
477
  tables: vec![],
477
478
  detected_languages: None,
@@ -522,7 +523,7 @@ async fn test_pipeline_cross_stage_data_flow() {
522
523
  #[async_trait]
523
524
  impl PostProcessor for MiddleProcessor {
524
525
  async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
525
- if let Some(stage) = result.metadata.additional.get("stage") {
526
+ if let Some(stage) = result.metadata.additional.get(&Cow::Borrowed("stage")) {
526
527
  result.content.push_str(&format!(
527
528
  "[saw:{}]",
528
529
  stage.as_str().expect("Failed to extract string from value")
@@ -541,7 +542,7 @@ async fn test_pipeline_cross_stage_data_flow() {
541
542
 
542
543
  let result = ExtractionResult {
543
544
  content: "start".to_string(),
544
- mime_type: "text/plain".to_string(),
545
+ mime_type: Cow::Borrowed("text/plain"),
545
546
  metadata: Metadata::default(),
546
547
  tables: vec![],
547
548
  detected_languages: None,
@@ -601,7 +602,7 @@ async fn test_pipeline_early_stage_error_recorded() {
601
602
 
602
603
  let result = ExtractionResult {
603
604
  content: "content".to_string(),
604
- mime_type: "text/plain".to_string(),
605
+ mime_type: Cow::Borrowed("text/plain"),
605
606
  metadata: Metadata::default(),
606
607
  tables: vec![],
607
608
  detected_languages: None,
@@ -645,7 +646,7 @@ async fn test_pipeline_middle_stage_error_propagation() {
645
646
 
646
647
  let result = ExtractionResult {
647
648
  content: "content".to_string(),
648
- mime_type: "text/plain".to_string(),
649
+ mime_type: Cow::Borrowed("text/plain"),
649
650
  metadata: Metadata::default(),
650
651
  tables: vec![],
651
652
  detected_languages: None,
@@ -719,7 +720,7 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
719
720
 
720
721
  let result = ExtractionResult {
721
722
  content: "start".to_string(),
722
- mime_type: "text/plain".to_string(),
723
+ mime_type: Cow::Borrowed("text/plain"),
723
724
  metadata: Metadata::default(),
724
725
  tables: vec![],
725
726
  detected_languages: None,
@@ -809,7 +810,7 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
809
810
 
810
811
  let result = ExtractionResult {
811
812
  content: "start".to_string(),
812
- mime_type: "text/plain".to_string(),
813
+ mime_type: Cow::Borrowed("text/plain"),
813
814
  metadata: Metadata::default(),
814
815
  tables: vec![],
815
816
  detected_languages: None,
@@ -889,7 +890,7 @@ async fn test_pipeline_multiple_processor_errors() {
889
890
 
890
891
  let result = ExtractionResult {
891
892
  content: "start".to_string(),
892
- mime_type: "text/plain".to_string(),
893
+ mime_type: Cow::Borrowed("text/plain"),
893
894
  metadata: Metadata::default(),
894
895
  tables: vec![],
895
896
  detected_languages: None,
@@ -933,7 +934,7 @@ async fn test_pipeline_error_context_preservation() {
933
934
 
934
935
  let result = ExtractionResult {
935
936
  content: "content".to_string(),
936
- mime_type: "text/plain".to_string(),
937
+ mime_type: Cow::Borrowed("text/plain"),
937
938
  metadata: Metadata::default(),
938
939
  tables: vec![],
939
940
  detected_languages: None,
@@ -991,8 +992,11 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
991
992
  #[async_trait]
992
993
  impl PostProcessor for MiddleReadingProcessor {
993
994
  async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
994
- if let Some(val) = result.metadata.additional.get("early_key") {
995
- result.metadata.additional.insert("middle_saw".to_string(), val.clone());
995
+ if let Some(val) = result.metadata.additional.get(&Cow::Borrowed("early_key")) {
996
+ result
997
+ .metadata
998
+ .additional
999
+ .insert(Cow::Borrowed("middle_saw"), val.clone());
996
1000
  }
997
1001
  Ok(())
998
1002
  }
@@ -1008,7 +1012,7 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
1008
1012
 
1009
1013
  let result = ExtractionResult {
1010
1014
  content: "content".to_string(),
1011
- mime_type: "text/plain".to_string(),
1015
+ mime_type: Cow::Borrowed("text/plain"),
1012
1016
  metadata: Metadata::default(),
1013
1017
  tables: vec![],
1014
1018
  detected_languages: None,
@@ -1082,7 +1086,7 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
1082
1086
 
1083
1087
  let result = ExtractionResult {
1084
1088
  content: "start".to_string(),
1085
- mime_type: "text/plain".to_string(),
1089
+ mime_type: Cow::Borrowed("text/plain"),
1086
1090
  metadata: Metadata::default(),
1087
1091
  tables: vec![],
1088
1092
  detected_languages: None,
@@ -1135,7 +1139,7 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
1135
1139
  result
1136
1140
  .metadata
1137
1141
  .additional
1138
- .insert("shared_key".to_string(), serde_json::json!(self.value));
1142
+ .insert(Cow::Borrowed("shared_key"), serde_json::json!(self.value));
1139
1143
  Ok(())
1140
1144
  }
1141
1145
  fn processing_stage(&self) -> ProcessingStage {
@@ -1153,7 +1157,7 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
1153
1157
 
1154
1158
  let result = ExtractionResult {
1155
1159
  content: "content".to_string(),
1156
- mime_type: "text/plain".to_string(),
1160
+ mime_type: Cow::Borrowed("text/plain"),
1157
1161
  metadata: Metadata::default(),
1158
1162
  tables: vec![],
1159
1163
  detected_languages: None,
@@ -1213,13 +1217,13 @@ async fn test_pipeline_processors_reading_previous_output() {
1213
1217
  let current_count = result
1214
1218
  .metadata
1215
1219
  .additional
1216
- .get("count")
1220
+ .get(&Cow::Borrowed("count"))
1217
1221
  .and_then(|v| v.as_i64())
1218
1222
  .unwrap_or(0);
1219
1223
  result
1220
1224
  .metadata
1221
1225
  .additional
1222
- .insert("count".to_string(), serde_json::json!(current_count + 1));
1226
+ .insert(Cow::Borrowed("count"), serde_json::json!(current_count + 1));
1223
1227
  Ok(())
1224
1228
  }
1225
1229
  fn processing_stage(&self) -> ProcessingStage {
@@ -1243,7 +1247,7 @@ async fn test_pipeline_processors_reading_previous_output() {
1243
1247
 
1244
1248
  let result = ExtractionResult {
1245
1249
  content: "content".to_string(),
1246
- mime_type: "text/plain".to_string(),
1250
+ mime_type: Cow::Borrowed("text/plain"),
1247
1251
  metadata: Metadata::default(),
1248
1252
  tables: vec![],
1249
1253
  detected_languages: None,
@@ -1310,7 +1314,7 @@ async fn test_pipeline_large_content_modification() {
1310
1314
 
1311
1315
  let result = ExtractionResult {
1312
1316
  content: "start".to_string(),
1313
- mime_type: "text/plain".to_string(),
1317
+ mime_type: Cow::Borrowed("text/plain"),
1314
1318
  metadata: Metadata::default(),
1315
1319
  tables: vec![],
1316
1320
  detected_languages: None,
@@ -1348,7 +1352,7 @@ async fn test_pipeline_enabled_processors_whitelist() {
1348
1352
 
1349
1353
  let result = ExtractionResult {
1350
1354
  content: "start".to_string(),
1351
- mime_type: "text/plain".to_string(),
1355
+ mime_type: Cow::Borrowed("text/plain"),
1352
1356
  metadata: Metadata::default(),
1353
1357
  tables: vec![],
1354
1358
  detected_languages: None,
@@ -1397,7 +1401,7 @@ async fn test_pipeline_disabled_processors_blacklist() {
1397
1401
 
1398
1402
  let result = ExtractionResult {
1399
1403
  content: "start".to_string(),
1400
- mime_type: "text/plain".to_string(),
1404
+ mime_type: Cow::Borrowed("text/plain"),
1401
1405
  metadata: Metadata::default(),
1402
1406
  tables: vec![],
1403
1407
  detected_languages: None,
@@ -1446,7 +1450,7 @@ async fn test_pipeline_no_filtering_runs_all() {
1446
1450
 
1447
1451
  let result = ExtractionResult {
1448
1452
  content: "start".to_string(),
1449
- mime_type: "text/plain".to_string(),
1453
+ mime_type: Cow::Borrowed("text/plain"),
1450
1454
  metadata: Metadata::default(),
1451
1455
  tables: vec![],
1452
1456
  detected_languages: None,
@@ -1486,7 +1490,7 @@ async fn test_pipeline_empty_whitelist_runs_none() {
1486
1490
 
1487
1491
  let result = ExtractionResult {
1488
1492
  content: "start".to_string(),
1489
- mime_type: "text/plain".to_string(),
1493
+ mime_type: Cow::Borrowed("text/plain"),
1490
1494
  metadata: Metadata::default(),
1491
1495
  tables: vec![],
1492
1496
  detected_languages: None,
@@ -54,9 +54,10 @@ impl OcrBackend for MockOcrBackend {
54
54
  return Err(KreuzbergError::validation("Empty image data".to_string()));
55
55
  }
56
56
 
57
+ use std::borrow::Cow;
57
58
  Ok(ExtractionResult {
58
59
  content: format!("{} (lang: {})", self.return_text, config.language),
59
- mime_type: "text/plain".to_string(),
60
+ mime_type: Cow::Borrowed("text/plain"),
60
61
  metadata: Metadata::default(),
61
62
  tables: vec![],
62
63
  detected_languages: None,
@@ -152,9 +153,10 @@ impl OcrBackend for ValidatingOcrBackend {
152
153
  )));
153
154
  }
154
155
 
156
+ use std::borrow::Cow;
155
157
  Ok(ExtractionResult {
156
158
  content: format!("Processed {} bytes", image_bytes.len()),
157
- mime_type: "text/plain".to_string(),
159
+ mime_type: Cow::Borrowed("text/plain"),
158
160
  metadata: Metadata::default(),
159
161
  tables: vec![],
160
162
  detected_languages: None,
@@ -201,19 +203,23 @@ impl Plugin for MetadataOcrBackend {
201
203
  impl OcrBackend for MetadataOcrBackend {
202
204
  async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
203
205
  let mut metadata = Metadata::default();
204
- metadata
205
- .additional
206
- .insert("ocr_backend".to_string(), serde_json::json!(self.name()));
207
- metadata
208
- .additional
209
- .insert("image_size".to_string(), serde_json::json!(image_bytes.len()));
210
- metadata
211
- .additional
212
- .insert("ocr_language".to_string(), serde_json::json!(config.language));
213
-
206
+ metadata.additional.insert(
207
+ std::borrow::Cow::Borrowed("ocr_backend"),
208
+ serde_json::json!(self.name()),
209
+ );
210
+ metadata.additional.insert(
211
+ std::borrow::Cow::Borrowed("image_size"),
212
+ serde_json::json!(image_bytes.len()),
213
+ );
214
+ metadata.additional.insert(
215
+ std::borrow::Cow::Borrowed("ocr_language"),
216
+ serde_json::json!(config.language),
217
+ );
218
+
219
+ use std::borrow::Cow;
214
220
  Ok(ExtractionResult {
215
221
  content: "OCR processed text".to_string(),
216
- mime_type: "text/plain".to_string(),
222
+ mime_type: Cow::Borrowed("text/plain"),
217
223
  metadata,
218
224
  tables: vec![],
219
225
  detected_languages: None,
@@ -11,6 +11,7 @@ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
11
11
  use kreuzberg::types::ExtractionResult;
12
12
  use kreuzberg::{KreuzbergError, Result, extract_file_sync};
13
13
  use serial_test::serial;
14
+ use std::borrow::Cow;
14
15
  use std::sync::Arc;
15
16
  use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
16
17
 
@@ -82,9 +83,9 @@ impl PostProcessor for MetadataAddingProcessor {
82
83
  result
83
84
  .metadata
84
85
  .additional
85
- .insert("processed_by".to_string(), serde_json::json!(self.name()));
86
+ .insert(Cow::Borrowed("processed_by"), serde_json::json!(self.name()));
86
87
  result.metadata.additional.insert(
87
- "word_count".to_string(),
88
+ Cow::Borrowed("word_count"),
88
89
  serde_json::json!(result.content.split_whitespace().count()),
89
90
  );
90
91
  Ok(())
@@ -11,6 +11,7 @@ use kreuzberg::plugins::registry::{
11
11
  use kreuzberg::plugins::{DocumentExtractor, Plugin, PostProcessor, ProcessingStage, Validator};
12
12
  use kreuzberg::types::{ExtractionResult, Metadata};
13
13
  use kreuzberg::{KreuzbergError, Result};
14
+ use std::borrow::Cow;
14
15
  use std::sync::Arc;
15
16
 
16
17
  struct FailingExtractor {
@@ -52,7 +53,7 @@ impl DocumentExtractor for FailingExtractor {
52
53
  } else {
53
54
  Ok(ExtractionResult {
54
55
  content: "success".to_string(),
55
- mime_type: "text/plain".to_string(),
56
+ mime_type: Cow::Borrowed("text/plain"),
56
57
  metadata: Metadata::default(),
57
58
  tables: vec![],
58
59
  detected_languages: None,
@@ -299,7 +300,7 @@ fn test_extractor_priority_ordering_complex() {
299
300
  async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
300
301
  Ok(ExtractionResult {
301
302
  content: "test".to_string(),
302
- mime_type: "text/plain".to_string(),
303
+ mime_type: Cow::Borrowed("text/plain"),
303
304
  metadata: Metadata::default(),
304
305
  tables: vec![],
305
306
  detected_languages: None,
@@ -461,7 +462,7 @@ async fn test_processor_execution_order_within_stage() {
461
462
 
462
463
  let mut result = ExtractionResult {
463
464
  content: "start".to_string(),
464
- mime_type: "text/plain".to_string(),
465
+ mime_type: Cow::Borrowed("text/plain"),
465
466
  metadata: Metadata::default(),
466
467
  tables: vec![],
467
468
  detected_languages: None,
@@ -498,7 +499,7 @@ async fn test_processor_error_propagation() {
498
499
 
499
500
  let mut result = ExtractionResult {
500
501
  content: "test".to_string(),
501
- mime_type: "text/plain".to_string(),
502
+ mime_type: Cow::Borrowed("text/plain"),
502
503
  metadata: Metadata::default(),
503
504
  tables: vec![],
504
505
  detected_languages: None,
@@ -672,7 +673,7 @@ async fn test_validator_content_validation() {
672
673
 
673
674
  let short_result = ExtractionResult {
674
675
  content: "short".to_string(),
675
- mime_type: "text/plain".to_string(),
676
+ mime_type: Cow::Borrowed("text/plain"),
676
677
  metadata: Metadata::default(),
677
678
  tables: vec![],
678
679
  detected_languages: None,
@@ -688,7 +689,7 @@ async fn test_validator_content_validation() {
688
689
 
689
690
  let long_result = ExtractionResult {
690
691
  content: "this is long enough content".to_string(),
691
- mime_type: "text/plain".to_string(),
692
+ mime_type: Cow::Borrowed("text/plain"),
692
693
  metadata: Metadata::default(),
693
694
  tables: vec![],
694
695
  detected_languages: None,
@@ -157,7 +157,7 @@ impl Plugin for MetadataValidator {
157
157
  #[async_trait]
158
158
  impl Validator for MetadataValidator {
159
159
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
160
- if !result.metadata.additional.contains_key(&self.required_key) {
160
+ if !result.metadata.additional.contains_key(self.required_key.as_str()) {
161
161
  Err(KreuzbergError::validation(format!(
162
162
  "Required metadata key '{}' missing",
163
163
  self.required_key