kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -4,6 +4,7 @@ use super::*;
4
4
  use crate::core::config::OutputFormat;
5
5
  use crate::types::Metadata;
6
6
  use lazy_static::lazy_static;
7
+ use std::borrow::Cow;
7
8
 
8
9
  const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
9
10
  #[cfg(feature = "quality")]
@@ -19,7 +20,7 @@ lazy_static! {
19
20
  async fn test_run_pipeline_basic() {
20
21
  let mut result = ExtractionResult {
21
22
  content: "test".to_string(),
22
- mime_type: "text/plain".to_string(),
23
+ mime_type: Cow::Borrowed("text/plain"),
23
24
  metadata: Metadata::default(),
24
25
  tables: vec![],
25
26
  detected_languages: None,
@@ -30,7 +31,7 @@ async fn test_run_pipeline_basic() {
30
31
  elements: None,
31
32
  };
32
33
  result.metadata.additional.insert(
33
- VALIDATION_MARKER_KEY.to_string(),
34
+ Cow::Borrowed(VALIDATION_MARKER_KEY),
34
35
  serde_json::json!(ORDER_VALIDATION_MARKER),
35
36
  );
36
37
  let config = ExtractionConfig::default();
@@ -44,7 +45,7 @@ async fn test_run_pipeline_basic() {
44
45
  async fn test_pipeline_with_quality_processing() {
45
46
  let result = ExtractionResult {
46
47
  content: "This is a test document with some meaningful content.".to_string(),
47
- mime_type: "text/plain".to_string(),
48
+ mime_type: Cow::Borrowed("text/plain"),
48
49
  metadata: Metadata::default(),
49
50
  tables: vec![],
50
51
  detected_languages: None,
@@ -67,7 +68,7 @@ async fn test_pipeline_with_quality_processing() {
67
68
  async fn test_pipeline_without_quality_processing() {
68
69
  let result = ExtractionResult {
69
70
  content: "test".to_string(),
70
- mime_type: "text/plain".to_string(),
71
+ mime_type: Cow::Borrowed("text/plain"),
71
72
  metadata: Metadata::default(),
72
73
  tables: vec![],
73
74
  detected_languages: None,
@@ -91,7 +92,7 @@ async fn test_pipeline_without_quality_processing() {
91
92
  async fn test_pipeline_with_chunking() {
92
93
  let result = ExtractionResult {
93
94
  content: "This is a long text that should be chunked. ".repeat(100),
94
- mime_type: "text/plain".to_string(),
95
+ mime_type: Cow::Borrowed("text/plain"),
95
96
  metadata: Metadata::default(),
96
97
  tables: vec![],
97
98
  detected_languages: None,
@@ -103,8 +104,10 @@ async fn test_pipeline_with_chunking() {
103
104
  };
104
105
  let config = ExtractionConfig {
105
106
  chunking: Some(crate::ChunkingConfig {
106
- max_chars: 500,
107
- max_overlap: 50,
107
+ max_characters: 500,
108
+ overlap: 50,
109
+ trim: true,
110
+ chunker_type: crate::ChunkerType::Text,
108
111
  embedding: None,
109
112
  preset: None,
110
113
  }),
@@ -121,7 +124,7 @@ async fn test_pipeline_with_chunking() {
121
124
  async fn test_pipeline_without_chunking() {
122
125
  let result = ExtractionResult {
123
126
  content: "test".to_string(),
124
- mime_type: "text/plain".to_string(),
127
+ mime_type: Cow::Borrowed("text/plain"),
125
128
  metadata: Metadata::default(),
126
129
  tables: vec![],
127
130
  detected_languages: None,
@@ -142,14 +145,14 @@ async fn test_pipeline_without_chunking() {
142
145
 
143
146
  #[tokio::test]
144
147
  async fn test_pipeline_preserves_metadata() {
145
- use std::collections::HashMap;
146
- let mut additional = HashMap::new();
147
- additional.insert("source".to_string(), serde_json::json!("test"));
148
- additional.insert("page".to_string(), serde_json::json!(1));
148
+ use ahash::AHashMap;
149
+ let mut additional = AHashMap::new();
150
+ additional.insert(Cow::Borrowed("source"), serde_json::json!("test"));
151
+ additional.insert(Cow::Borrowed("page"), serde_json::json!(1));
149
152
 
150
153
  let result = ExtractionResult {
151
154
  content: "test".to_string(),
152
- mime_type: "text/plain".to_string(),
155
+ mime_type: Cow::Borrowed("text/plain"),
153
156
  metadata: Metadata {
154
157
  additional,
155
158
  ..Default::default()
@@ -187,7 +190,7 @@ async fn test_pipeline_preserves_tables() {
187
190
 
188
191
  let result = ExtractionResult {
189
192
  content: "test".to_string(),
190
- mime_type: "text/plain".to_string(),
193
+ mime_type: Cow::Borrowed("text/plain"),
191
194
  metadata: Metadata::default(),
192
195
  tables: vec![table],
193
196
  detected_languages: None,
@@ -219,7 +222,7 @@ async fn test_pipeline_empty_content() {
219
222
 
220
223
  let result = ExtractionResult {
221
224
  content: String::new(),
222
- mime_type: "text/plain".to_string(),
225
+ mime_type: Cow::Borrowed("text/plain"),
223
226
  metadata: Metadata::default(),
224
227
  tables: vec![],
225
228
  detected_languages: None,
@@ -242,7 +245,7 @@ async fn test_pipeline_empty_content() {
242
245
  async fn test_pipeline_with_all_features() {
243
246
  let result = ExtractionResult {
244
247
  content: "This is a comprehensive test document. ".repeat(50),
245
- mime_type: "text/plain".to_string(),
248
+ mime_type: Cow::Borrowed("text/plain"),
246
249
  metadata: Metadata::default(),
247
250
  tables: vec![],
248
251
  detected_languages: None,
@@ -255,8 +258,10 @@ async fn test_pipeline_with_all_features() {
255
258
  let config = ExtractionConfig {
256
259
  enable_quality_processing: true,
257
260
  chunking: Some(crate::ChunkingConfig {
258
- max_chars: 500,
259
- max_overlap: 50,
261
+ max_characters: 500,
262
+ overlap: 50,
263
+ trim: true,
264
+ chunker_type: crate::ChunkerType::Text,
260
265
  embedding: None,
261
266
  preset: None,
262
267
  }),
@@ -295,7 +300,7 @@ machine learning that uses neural networks with multiple layers.
295
300
  Natural language processing enables computers to understand human language.
296
301
  "#
297
302
  .to_string(),
298
- mime_type: "text/plain".to_string(),
303
+ mime_type: Cow::Borrowed("text/plain"),
299
304
  metadata: Metadata::default(),
300
305
  tables: vec![],
301
306
  detected_languages: None,
@@ -342,7 +347,7 @@ async fn test_pipeline_without_keyword_config() {
342
347
  }
343
348
  let result = ExtractionResult {
344
349
  content: "Machine learning and artificial intelligence.".to_string(),
345
- mime_type: "text/plain".to_string(),
350
+ mime_type: Cow::Borrowed("text/plain"),
346
351
  metadata: Metadata::default(),
347
352
  tables: vec![],
348
353
  detected_languages: None,
@@ -380,7 +385,7 @@ async fn test_pipeline_keyword_extraction_short_content() {
380
385
 
381
386
  let result = ExtractionResult {
382
387
  content: "Short text".to_string(),
383
- mime_type: "text/plain".to_string(),
388
+ mime_type: Cow::Borrowed("text/plain"),
384
389
  metadata: Metadata::default(),
385
390
  tables: vec![],
386
391
  detected_languages: None,
@@ -437,7 +442,7 @@ async fn test_postprocessor_runs_before_validator() {
437
442
  result
438
443
  .metadata
439
444
  .additional
440
- .insert("processed".to_string(), serde_json::json!(true));
445
+ .insert(Cow::Borrowed("processed"), serde_json::json!(true));
441
446
  Ok(())
442
447
  }
443
448
 
@@ -517,7 +522,7 @@ async fn test_postprocessor_runs_before_validator() {
517
522
 
518
523
  let mut result = ExtractionResult {
519
524
  content: "test".to_string(),
520
- mime_type: "text/plain".to_string(),
525
+ mime_type: Cow::Borrowed("text/plain"),
521
526
  metadata: Metadata::default(),
522
527
  tables: vec![],
523
528
  detected_languages: None,
@@ -528,7 +533,7 @@ async fn test_postprocessor_runs_before_validator() {
528
533
  elements: None,
529
534
  };
530
535
  result.metadata.additional.insert(
531
- VALIDATION_MARKER_KEY.to_string(),
536
+ Cow::Borrowed(VALIDATION_MARKER_KEY),
532
537
  serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
533
538
  );
534
539
 
@@ -614,7 +619,7 @@ async fn test_quality_processing_runs_before_validator() {
614
619
 
615
620
  let mut result = ExtractionResult {
616
621
  content: "This is meaningful test content for quality scoring.".to_string(),
617
- mime_type: "text/plain".to_string(),
622
+ mime_type: Cow::Borrowed("text/plain"),
618
623
  metadata: Metadata::default(),
619
624
  tables: vec![],
620
625
  detected_languages: None,
@@ -625,7 +630,7 @@ async fn test_quality_processing_runs_before_validator() {
625
630
  elements: None,
626
631
  };
627
632
  result.metadata.additional.insert(
628
- VALIDATION_MARKER_KEY.to_string(),
633
+ Cow::Borrowed(VALIDATION_MARKER_KEY),
629
634
  serde_json::json!(QUALITY_VALIDATION_MARKER),
630
635
  );
631
636
 
@@ -682,7 +687,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
682
687
  result
683
688
  .metadata
684
689
  .additional
685
- .insert("execution_order".to_string(), serde_json::json!(order));
690
+ .insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
686
691
  Ok(())
687
692
  }
688
693
 
@@ -721,7 +726,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
721
726
  result
722
727
  .metadata
723
728
  .additional
724
- .insert("execution_order".to_string(), serde_json::json!(order));
729
+ .insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
725
730
  Ok(())
726
731
  }
727
732
 
@@ -812,7 +817,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
812
817
 
813
818
  let result = ExtractionResult {
814
819
  content: "test".to_string(),
815
- mime_type: "text/plain".to_string(),
820
+ mime_type: Cow::Borrowed("text/plain"),
816
821
  metadata: Metadata::default(),
817
822
  tables: vec![],
818
823
  detected_languages: None,
@@ -839,7 +844,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
839
844
  async fn test_run_pipeline_with_output_format_plain() {
840
845
  let result = ExtractionResult {
841
846
  content: "test content".to_string(),
842
- mime_type: "text/plain".to_string(),
847
+ mime_type: Cow::Borrowed("text/plain"),
843
848
  metadata: Metadata::default(),
844
849
  tables: vec![],
845
850
  detected_languages: None,
@@ -865,7 +870,7 @@ async fn test_run_pipeline_with_output_format_djot() {
865
870
 
866
871
  let result = ExtractionResult {
867
872
  content: "test content".to_string(),
868
- mime_type: "text/djot".to_string(),
873
+ mime_type: Cow::Borrowed("text/djot"),
869
874
  metadata: Metadata::default(),
870
875
  tables: vec![],
871
876
  detected_languages: None,
@@ -894,7 +899,7 @@ async fn test_run_pipeline_with_output_format_djot() {
894
899
  images: vec![],
895
900
  links: vec![],
896
901
  footnotes: vec![],
897
- attributes: std::collections::HashMap::new(),
902
+ attributes: Vec::new(),
898
903
  }),
899
904
  };
900
905
 
@@ -912,7 +917,7 @@ async fn test_run_pipeline_with_output_format_djot() {
912
917
  async fn test_run_pipeline_with_output_format_html() {
913
918
  let result = ExtractionResult {
914
919
  content: "test content".to_string(),
915
- mime_type: "text/plain".to_string(),
920
+ mime_type: Cow::Borrowed("text/plain"),
916
921
  metadata: Metadata::default(),
917
922
  tables: vec![],
918
923
  detected_languages: None,
@@ -942,7 +947,7 @@ async fn test_run_pipeline_applies_output_format_last() {
942
947
 
943
948
  let result = ExtractionResult {
944
949
  content: "test".to_string(),
945
- mime_type: "text/plain".to_string(),
950
+ mime_type: Cow::Borrowed("text/plain"),
946
951
  metadata: Metadata::default(),
947
952
  tables: vec![],
948
953
  detected_languages: None,
@@ -958,7 +963,7 @@ async fn test_run_pipeline_applies_output_format_last() {
958
963
  images: vec![],
959
964
  links: vec![],
960
965
  footnotes: vec![],
961
- attributes: std::collections::HashMap::new(),
966
+ attributes: Vec::new(),
962
967
  }),
963
968
  };
964
969
 
@@ -24,6 +24,8 @@
24
24
  //! # Ok(())
25
25
  //! # }
26
26
  //! ```
27
+ use bytes::Bytes;
28
+
27
29
  use crate::error::{KreuzbergError, Result};
28
30
  use crate::types::{EmailAttachment, EmailExtractionResult};
29
31
  use mail_parser::MimeHeaders;
@@ -101,7 +103,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
101
103
 
102
104
  let html_content = message.body_html(0).map(|s| s.to_string());
103
105
 
104
- let cleaned_text = if let Some(plain) = &plain_text {
106
+ let cleaned_text = if let Some(ref plain) = plain_text {
105
107
  plain.clone()
106
108
  } else if let Some(html) = &html_content {
107
109
  clean_html_content(html)
@@ -132,7 +134,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
132
134
  mime_type: Some(mime_type),
133
135
  size: Some(size),
134
136
  is_image,
135
- data: Some(data.to_vec()),
137
+ data: Some(Bytes::copy_from_slice(data)),
136
138
  });
137
139
  }
138
140
 
@@ -174,39 +176,49 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
174
176
  let to_emails = outlook
175
177
  .to
176
178
  .iter()
177
- .map(|p| p.email.clone())
178
- .filter(|e| !e.is_empty())
179
+ .filter_map(|p| {
180
+ if p.email.is_empty() {
181
+ None
182
+ } else {
183
+ Some(p.email.clone())
184
+ }
185
+ })
179
186
  .collect::<Vec<String>>();
180
187
 
181
188
  let cc_emails = outlook
182
189
  .cc
183
190
  .iter()
184
- .map(|p| p.email.clone())
185
- .filter(|e| !e.is_empty())
191
+ .filter_map(|p| {
192
+ if p.email.is_empty() {
193
+ None
194
+ } else {
195
+ Some(p.email.clone())
196
+ }
197
+ })
186
198
  .collect::<Vec<String>>();
187
199
 
188
- let bcc_emails = if !outlook.bcc.is_empty() {
189
- vec![outlook.bcc.clone()]
190
- } else {
200
+ let bcc_emails = if outlook.bcc.is_empty() {
191
201
  vec![]
202
+ } else {
203
+ vec![outlook.bcc.clone()]
192
204
  };
193
205
 
194
- let date = if !outlook.headers.date.is_empty() {
195
- Some(outlook.headers.date.clone())
196
- } else {
206
+ let date = if outlook.headers.date.is_empty() {
197
207
  None
208
+ } else {
209
+ Some(outlook.headers.date.clone())
198
210
  };
199
211
 
200
- let message_id = if !outlook.headers.message_id.is_empty() {
201
- Some(outlook.headers.message_id.clone())
202
- } else {
212
+ let message_id = if outlook.headers.message_id.is_empty() {
203
213
  None
214
+ } else {
215
+ Some(outlook.headers.message_id.clone())
204
216
  };
205
217
 
206
- let plain_text = if !outlook.body.is_empty() {
207
- Some(outlook.body.clone())
208
- } else {
218
+ let plain_text = if outlook.body.is_empty() {
209
219
  None
220
+ } else {
221
+ Some(outlook.body.clone())
210
222
  };
211
223
 
212
224
  let html_content = None;
@@ -231,7 +243,7 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
231
243
  };
232
244
 
233
245
  let data = if !att.payload.is_empty() {
234
- hex::decode(&att.payload).ok()
246
+ hex::decode(&att.payload).ok().map(Bytes::from)
235
247
  } else {
236
248
  None
237
249
  };
@@ -448,13 +448,13 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
448
448
  markdown.push_str(" | ");
449
449
  }
450
450
  let cell_str = format_cell_to_string(cell);
451
- header_cells.push(cell_str.clone());
452
451
 
453
452
  if cell_str.contains('|') || cell_str.contains('\\') {
454
453
  escape_markdown_into(&mut markdown, &cell_str);
455
454
  } else {
456
455
  markdown.push_str(&cell_str);
457
456
  }
457
+ header_cells.push(cell_str);
458
458
  }
459
459
  markdown.push_str(" |\n");
460
460
  cells.push(header_cells);
@@ -475,18 +475,19 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
475
475
  if i > 0 {
476
476
  markdown.push_str(" | ");
477
477
  }
478
- if let Some(cell) = row.get(i) {
478
+ let cell_str = if let Some(cell) = row.get(i) {
479
479
  let cell_str = format_cell_to_string(cell);
480
- row_cells.push(cell_str.clone());
481
480
 
482
481
  if cell_str.contains('|') || cell_str.contains('\\') {
483
482
  escape_markdown_into(&mut markdown, &cell_str);
484
483
  } else {
485
484
  markdown.push_str(&cell_str);
486
485
  }
486
+ cell_str
487
487
  } else {
488
- row_cells.push(String::new());
489
- }
488
+ String::new()
489
+ };
490
+ row_cells.push(cell_str);
490
491
  }
491
492
  markdown.push_str(" |\n");
492
493
  cells.push(row_cells);
@@ -1,5 +1,7 @@
1
1
  //! Image handling and conversion functionality for HTML extraction.
2
2
 
3
+ use bytes::Bytes;
4
+
3
5
  use super::types::ExtractedInlineImage;
4
6
  use html_to_markdown_rs::{InlineImage, InlineImageFormat};
5
7
 
@@ -49,13 +51,16 @@ pub fn inline_image_format_to_str(format: &InlineImageFormat) -> String {
49
51
  }
50
52
  }
51
53
 
54
+ // Note: This function returns String because ExtractedInlineImage.format is String (internal to HTML extraction).
55
+ // For external ExtractedImage, use detect_image_format from pptx which returns Cow<'static, str>.
56
+
52
57
  /// Convert a library InlineImage to an ExtractedInlineImage.
53
58
  ///
54
59
  /// Maps the library's image representation to the extraction API's format,
55
60
  /// converting the format enum to a string representation.
56
61
  pub fn inline_image_to_extracted(image: InlineImage) -> ExtractedInlineImage {
57
62
  ExtractedInlineImage {
58
- data: image.data,
63
+ data: Bytes::from(image.data),
59
64
  format: inline_image_format_to_str(&image.format),
60
65
  filename: image.filename,
61
66
  description: image.description,
@@ -1,7 +1,7 @@
1
1
  //! Type definitions for HTML extraction.
2
2
 
3
+ use bytes::Bytes;
3
4
  use serde::{Deserialize, Serialize};
4
- use std::collections::HashMap;
5
5
 
6
6
  pub use html_to_markdown_rs::{
7
7
  CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
@@ -19,10 +19,11 @@ pub struct HtmlExtractionResult {
19
19
  /// Extracted inline image with metadata.
20
20
  #[derive(Debug, Clone, Serialize, Deserialize)]
21
21
  pub struct ExtractedInlineImage {
22
- pub data: Vec<u8>,
22
+ /// Uses `bytes::Bytes` for cheap cloning of large buffers.
23
+ pub data: Bytes,
23
24
  pub format: String,
24
25
  pub filename: Option<String>,
25
26
  pub description: Option<String>,
26
27
  pub dimensions: Option<(u32, u32)>,
27
- pub attributes: HashMap<String, String>,
28
+ pub attributes: Vec<(String, String)>,
28
29
  }
@@ -45,6 +45,7 @@
45
45
 
46
46
  use crate::error::{KreuzbergError, Result};
47
47
  use crate::types::LibreOfficeConversionResult;
48
+ use std::borrow::Cow;
48
49
  use std::collections::HashSet;
49
50
  use std::env;
50
51
  use std::fs as std_fs;
@@ -326,9 +327,9 @@ pub async fn convert_doc_to_docx(doc_bytes: &[u8]) -> Result<LibreOfficeConversi
326
327
 
327
328
  Ok(LibreOfficeConversionResult {
328
329
  converted_bytes,
329
- original_format: "doc".to_string(),
330
- target_format: "docx".to_string(),
331
- target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
330
+ original_format: Cow::Borrowed("doc"),
331
+ target_format: Cow::Borrowed("docx"),
332
+ target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
332
333
  })
333
334
  }
334
335
 
@@ -350,9 +351,9 @@ pub async fn convert_ppt_to_pptx(ppt_bytes: &[u8]) -> Result<LibreOfficeConversi
350
351
 
351
352
  Ok(LibreOfficeConversionResult {
352
353
  converted_bytes,
353
- original_format: "ppt".to_string(),
354
- target_format: "pptx".to_string(),
355
- target_mime: crate::core::mime::POWER_POINT_MIME_TYPE.to_string(),
354
+ original_format: Cow::Borrowed("ppt"),
355
+ target_format: Cow::Borrowed("pptx"),
356
+ target_mime: Cow::Borrowed(crate::core::mime::POWER_POINT_MIME_TYPE),
356
357
  })
357
358
  }
358
359
 
@@ -505,9 +506,9 @@ mod tests {
505
506
  async fn test_conversion_result_structure() {
506
507
  let result = LibreOfficeConversionResult {
507
508
  converted_bytes: vec![1, 2, 3],
508
- original_format: "doc".to_string(),
509
- target_format: "docx".to_string(),
510
- target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
509
+ original_format: Cow::Borrowed("doc"),
510
+ target_format: Cow::Borrowed("docx"),
511
+ target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
511
512
  };
512
513
 
513
514
  assert_eq!(result.original_format, "doc");
@@ -3,6 +3,8 @@
3
3
  //! This module handles image-related parsing from slide XML and
4
4
  //! detection of image formats from file data.
5
5
 
6
+ use std::borrow::Cow;
7
+
6
8
  pub(super) fn html_escape(text: &str) -> String {
7
9
  text.replace('&', "&amp;")
8
10
  .replace('<', "&lt;")
@@ -11,21 +13,21 @@ pub(super) fn html_escape(text: &str) -> String {
11
13
  .replace('\'', "&#x27;")
12
14
  }
13
15
 
14
- pub(super) fn detect_image_format(data: &[u8]) -> String {
16
+ pub(super) fn detect_image_format(data: &[u8]) -> Cow<'static, str> {
15
17
  if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
16
- "jpeg".to_string()
18
+ Cow::Borrowed("jpeg")
17
19
  } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
18
- "png".to_string()
20
+ Cow::Borrowed("png")
19
21
  } else if data.starts_with(b"GIF") {
20
- "gif".to_string()
22
+ Cow::Borrowed("gif")
21
23
  } else if data.starts_with(b"BM") {
22
- "bmp".to_string()
24
+ Cow::Borrowed("bmp")
23
25
  } else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
24
- "svg".to_string()
26
+ Cow::Borrowed("svg")
25
27
  } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
26
- "tiff".to_string()
28
+ Cow::Borrowed("tiff")
27
29
  } else {
28
- "unknown".to_string()
30
+ Cow::Borrowed("unknown")
29
31
  }
30
32
  }
31
33
 
@@ -45,6 +45,8 @@ mod image_handling;
45
45
  mod metadata;
46
46
  mod parser;
47
47
 
48
+ use bytes::Bytes;
49
+
48
50
  use crate::error::Result;
49
51
  use crate::types::{ExtractedImage, PptxExtractionResult};
50
52
 
@@ -117,8 +119,8 @@ pub fn extract_pptx_from_path(
117
119
  let image_index = extracted_images.len();
118
120
 
119
121
  extracted_images.push(ExtractedImage {
120
- data,
121
- format,
122
+ data: Bytes::from(data),
123
+ format, // Already a Cow<'static, str> from detect_image_format
122
124
  image_index,
123
125
  page_number: Some(slide.slide_number as usize),
124
126
  width: None,
@@ -333,11 +335,13 @@ mod tests {
333
335
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
334
336
  );
335
337
  for (i, _) in slides.iter().enumerate() {
336
- rels_xml.push_str(&format!(
338
+ use std::fmt::Write;
339
+ let _ = write!(
340
+ rels_xml,
337
341
  r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
338
342
  i + 1,
339
343
  i + 1
340
- ));
344
+ );
341
345
  }
342
346
  rels_xml.push_str("</Relationships>");
343
347
  zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
@@ -33,12 +33,13 @@
33
33
  use crate::error::{KreuzbergError, Result};
34
34
  use crate::text::utf8_validation;
35
35
  use serde::{Deserialize, Serialize};
36
+ use std::borrow::Cow;
36
37
  use std::collections::HashMap;
37
38
 
38
39
  #[derive(Debug, Clone, Serialize, Deserialize)]
39
40
  pub struct StructuredDataResult {
40
41
  pub content: String,
41
- pub format: String,
42
+ pub format: Cow<'static, str>,
42
43
  pub metadata: HashMap<String, String>,
43
44
  pub text_fields: Vec<String>,
44
45
  }
@@ -97,7 +98,7 @@ pub fn parse_json(data: &[u8], config: Option<JsonExtractionConfig>) -> Result<S
97
98
 
98
99
  Ok(StructuredDataResult {
99
100
  content,
100
- format: "json".to_string(),
101
+ format: Cow::Borrowed("json"),
101
102
  metadata,
102
103
  text_fields,
103
104
  })
@@ -254,7 +255,7 @@ pub fn parse_yaml(data: &[u8]) -> Result<StructuredDataResult> {
254
255
 
255
256
  Ok(StructuredDataResult {
256
257
  content,
257
- format: "yaml".to_string(),
258
+ format: Cow::Borrowed("yaml"),
258
259
  metadata,
259
260
  text_fields,
260
261
  })
@@ -326,7 +327,7 @@ pub fn parse_toml(data: &[u8]) -> Result<StructuredDataResult> {
326
327
 
327
328
  Ok(StructuredDataResult {
328
329
  content,
329
- format: "toml".to_string(),
330
+ format: Cow::Borrowed("toml"),
330
331
  metadata,
331
332
  text_fields,
332
333
  })
@@ -167,7 +167,7 @@ pub(super) fn process_images(
167
167
  element_index: Some(elements.len()),
168
168
  additional: {
169
169
  let mut m = HashMap::new();
170
- m.insert("format".to_string(), image.format.clone());
170
+ m.insert("format".to_string(), image.format.to_string());
171
171
  if let Some(width) = image.width {
172
172
  m.insert("width".to_string(), width.to_string());
173
173
  }