kreuzberg 4.9.4 → 4.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  4. data/ext/kreuzberg_rb/native/src/config/types.rs +1 -0
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +5 -5
  7. data/vendor/kreuzberg/Cargo.toml +3 -3
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/ocr.rs +8 -0
  10. data/vendor/kreuzberg/src/core/config/processing.rs +72 -14
  11. data/vendor/kreuzberg/src/core/extractor/bytes.rs +27 -3
  12. data/vendor/kreuzberg/src/core/extractor/file.rs +27 -3
  13. data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -20
  14. data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
  15. data/vendor/kreuzberg/src/extraction/email.rs +72 -10
  16. data/vendor/kreuzberg/src/extraction/image.rs +2 -2
  17. data/vendor/kreuzberg/src/extraction/image_ocr.rs +6 -1
  18. data/vendor/kreuzberg/src/extraction/transform/content.rs +249 -4
  19. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -5
  20. data/vendor/kreuzberg/src/extractors/email.rs +12 -11
  21. data/vendor/kreuzberg/src/extractors/hwp.rs +18 -5
  22. data/vendor/kreuzberg/src/extractors/image.rs +11 -6
  23. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +16 -2
  24. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +46 -16
  25. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +26 -8
  26. data/vendor/kreuzberg/src/mcp/params.rs +17 -1
  27. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +1 -0
  28. data/vendor/kreuzberg/src/ocr/types.rs +11 -1
  29. data/vendor/kreuzberg/src/ort_discovery.rs +74 -22
  30. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +108 -10
  31. data/vendor/kreuzberg/src/pdf/images.rs +134 -8
  32. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +4 -4
  33. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +7 -3
  34. data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +9 -0
  35. data/vendor/kreuzberg/src/rendering/djot.rs +8 -0
  36. data/vendor/kreuzberg/src/rendering/markdown.rs +7 -0
  37. data/vendor/kreuzberg/src/rendering/plain.rs +16 -7
  38. data/vendor/kreuzberg/src/types/formats.rs +6 -2
  39. data/vendor/kreuzberg/src/utils/image_decode.rs +99 -0
  40. data/vendor/kreuzberg/src/utils/mod.rs +8 -0
  41. data/vendor/kreuzberg/tests/docx_ocr_integration_test.rs +84 -0
  42. data/vendor/kreuzberg/tests/email_integration.rs +18 -7
  43. data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +92 -0
  44. data/vendor/kreuzberg/tests/gpu_acceleration.rs +419 -0
  45. data/vendor/kreuzberg/tests/issue_797_preset_embedding_regression.rs +75 -0
  46. data/vendor/kreuzberg/tests/markdown_lint_quality.rs +18 -6
  47. data/vendor/kreuzberg/tests/mcp_integration.rs +13 -5
  48. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
  49. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +16 -20
  50. data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +129 -0
  51. data/vendor/kreuzberg/tests/test_batch_extract_schema.rs +56 -0
  52. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  53. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  54. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  55. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  56. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  57. data/vendor/kreuzberg-tesseract/build.rs +5 -0
  58. metadata +8 -3
  59. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6921
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d8a203168595f6b316a165f500818abed75d89c7a82c46b5b20df996a4bb841
4
- data.tar.gz: 28fd19fecd9b18597f17a783923ec3ec08cfa7b99612fec1ca8790aa5cdddbdc
3
+ metadata.gz: 9f3132b44aad1652c76e8b1445b775eb3586e48661908eda794c95339f06387d
4
+ data.tar.gz: 2f957af07040ec2f3bcd79c299dd429a752423d714eea73bfb608a28718a6c11
5
5
  SHA512:
6
- metadata.gz: af522bff519c1082396d9a6a9480a088693791a4f50818fd1d233726082675559a3144f7758b0ee217b18cdbb1cd08236ecbb332f68c4186a26aa69b83454392
7
- data.tar.gz: 4109e6dbc32c5fed518ba84940a7bc732553a178d3b67b69dad6eff5b998aad12b0a53cfe6c6d0784848cbb484e7b79a7abe9154f23d2100786f38414d8286c0
6
+ metadata.gz: 878748ecb791e049c2de05cdc4ec7b9f6749bb265981c98ea49126108ca7c2782b92a6b5ed31d1fbfbeee83e3c45c80aaf74aacecd20f9bc428d796709afa0aa
7
+ data.tar.gz: ff137eb78f8fcfcc2ac357b0d9adf6d3d6fee11a448679a976678e0745905a0abcd8abfeb331028d780810d96dc47a04ec01dda94c900ec63ad4b35c124c187f
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.4"
3
+ version = "4.9.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.2.6", default-features = false }
68
+ html-to-markdown-rs = { version = "3.3.1", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED
@@ -54,6 +54,7 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
54
54
  quality_thresholds: None,
55
55
  vlm_config: None,
56
56
  vlm_prompt: None,
57
+ acceleration: None,
57
58
  };
58
59
 
59
60
  if let Some(val) = get_kw(ruby, hash, "tesseract_config")
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.4'
4
+ VERSION = '4.9.6'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.4"
5
+ version = "4.9.6"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,12 +28,12 @@ dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.2.6", default-features = false }
31
+ html-to-markdown-rs = { version = "3.3.1", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.6", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.6" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
57
57
  tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
58
  toml = "1.1.2"
59
59
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false }
60
+ tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
61
61
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
62
  wasm-bindgen-futures = "0.4"
63
63
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.4"
3
+ version = "4.9.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
271
  "simd",
272
272
  ], optional = true }
273
273
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.2.6", default-features = false, features = [
274
+ html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
275
275
  "inline-images",
276
276
  "metadata",
277
277
  ], optional = true }
@@ -392,7 +392,7 @@ optional = true
392
392
  # Override getrandom to enable js feature for WASM targets
393
393
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
394
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false, optional = true }
395
+ tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
396
396
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
397
 
398
398
  [build-dependencies]
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.4 Release**
21
+ > **🚀 Version 4.9.6 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/core/config/ocr.rs CHANGED
@@ -271,6 +271,13 @@ pub struct OcrConfig {
271
271
  /// - `{{ language }}` — The document language code (e.g., "eng", "deu").
272
272
  #[serde(default, skip_serializing_if = "Option::is_none")]
273
273
  pub vlm_prompt: Option<String>,
274
+
275
+ /// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
276
+ ///
277
+ /// Not user-configurable via config files — injected at runtime from
278
+ /// `ExtractionConfig::acceleration` before each `process_image` call.
279
+ #[serde(skip)]
280
+ pub acceleration: Option<super::acceleration::AccelerationConfig>,
274
281
  }
275
282
 
276
283
  impl Default for OcrConfig {
@@ -288,6 +295,7 @@ impl Default for OcrConfig {
288
295
  auto_rotate: false,
289
296
  vlm_config: None,
290
297
  vlm_prompt: None,
298
+ acceleration: None,
291
299
  }
292
300
  }
293
301
  }
data/vendor/kreuzberg/src/core/config/processing.rs CHANGED
@@ -267,15 +267,10 @@ impl ChunkingConfig {
267
267
  }
268
268
  };
269
269
 
270
- let embedding = match &self.embedding {
271
- Some(existing) => Some(existing.clone()),
272
- None => Some(EmbeddingConfig {
273
- model: EmbeddingModelType::Preset {
274
- name: preset_name.clone(),
275
- },
276
- ..EmbeddingConfig::default()
277
- }),
278
- };
270
+ // Preserve the caller's embedding choice, including None.
271
+ // Presets configure chunking parameters only; users must explicitly
272
+ // provide an EmbeddingConfig to opt into embedding generation.
273
+ let embedding = self.embedding.clone();
279
274
 
280
275
  Self {
281
276
  max_characters: preset.chunk_size,
@@ -568,11 +563,9 @@ mod tests {
568
563
  let resolved = config.resolve_preset();
569
564
  assert_eq!(resolved.max_characters, 1024);
570
565
  assert_eq!(resolved.overlap, 100);
571
- assert!(resolved.embedding.is_some());
572
- match &resolved.embedding.unwrap().model {
573
- EmbeddingModelType::Preset { name } => assert_eq!(name, "balanced"),
574
- _ => panic!("Expected Preset model type"),
575
- }
566
+ // Preset configures chunking parameters only; embedding stays None unless
567
+ // the caller explicitly provided one (#797).
568
+ assert!(resolved.embedding.is_none());
576
569
  }
577
570
 
578
571
  #[test]
@@ -686,4 +679,69 @@ mod tests {
686
679
  _ => panic!("Expected Custom variant"),
687
680
  }
688
681
  }
682
+
683
+ // --- Issue #797 regression tests ---
684
+
685
+ /// Preset with no explicit embedding: embedding must remain None.
686
+ ///
687
+ /// Before the fix, `resolve_preset()` would silently inject an
688
+ /// `EmbeddingConfig` whenever a preset was configured, causing every
689
+ /// chunk to have an unexpected `.embedding` field populated.
690
+ #[test]
691
+ #[cfg(feature = "embeddings")]
692
+ fn test_resolve_preset_does_not_inject_embedding_when_none() {
693
+ let config = ChunkingConfig {
694
+ preset: Some("multilingual".to_string()),
695
+ embedding: None,
696
+ ..Default::default()
697
+ };
698
+ let resolved = config.resolve_preset();
699
+ assert!(
700
+ resolved.embedding.is_none(),
701
+ "preset alone must not inject an EmbeddingConfig (#797)"
702
+ );
703
+ }
704
+
705
+ /// Preset with an explicit embedding: the embedding must be preserved unchanged.
706
+ #[test]
707
+ #[cfg(feature = "embeddings")]
708
+ fn test_resolve_preset_preserves_explicit_embedding_config() {
709
+ let explicit = EmbeddingConfig {
710
+ model: EmbeddingModelType::Custom {
711
+ model_id: "my-org/model".to_string(),
712
+ dimensions: 768,
713
+ },
714
+ batch_size: 16,
715
+ ..Default::default()
716
+ };
717
+ let config = ChunkingConfig {
718
+ preset: Some("multilingual".to_string()),
719
+ embedding: Some(explicit),
720
+ ..Default::default()
721
+ };
722
+ let resolved = config.resolve_preset();
723
+ let emb = resolved
724
+ .embedding
725
+ .expect("explicit embedding must survive resolve_preset");
726
+ assert_eq!(emb.batch_size, 16);
727
+ match emb.model {
728
+ EmbeddingModelType::Custom { model_id, dimensions } => {
729
+ assert_eq!(model_id, "my-org/model");
730
+ assert_eq!(dimensions, 768);
731
+ }
732
+ other => panic!("expected Custom model type, got {other:?}"),
733
+ }
734
+ }
735
+
736
+ /// No preset, no embedding: embedding must stay None (regression guard).
737
+ #[test]
738
+ fn test_resolve_preset_no_preset_no_embedding_stays_none() {
739
+ let config = ChunkingConfig {
740
+ preset: None,
741
+ embedding: None,
742
+ ..Default::default()
743
+ };
744
+ let resolved = config.resolve_preset();
745
+ assert!(resolved.embedding.is_none(), "no-preset path must not touch embedding");
746
+ }
689
747
  }
data/vendor/kreuzberg/src/core/extractor/bytes.rs CHANGED
@@ -66,7 +66,7 @@ use super::file::extract_bytes_with_extractor;
66
66
  pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
67
67
  use crate::core::mime;
68
68
 
69
- let result = async {
69
+ let extraction_future = async {
70
70
  if config.force_ocr && config.effective_disable_ocr() {
71
71
  return Err(crate::KreuzbergError::Validation {
72
72
  message: "force_ocr and disable_ocr cannot both be true".to_string(),
@@ -105,8 +105,32 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
105
105
  }
106
106
 
107
107
  extract_bytes_with_extractor(content, &validated_mime, config).await
108
- }
109
- .await;
108
+ };
109
+
110
+ #[cfg(feature = "tokio-runtime")]
111
+ let result = if let Some(secs) = config.extraction_timeout_secs {
112
+ let start = std::time::Instant::now();
113
+ match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
114
+ Ok(inner) => inner,
115
+ Err(_elapsed) => {
116
+ if let Some(ref token) = config.cancel_token {
117
+ token.cancel();
118
+ }
119
+ Err(crate::KreuzbergError::Timeout {
120
+ elapsed_ms: start.elapsed().as_millis() as u64,
121
+ limit_ms: secs * 1000,
122
+ })
123
+ }
124
+ }
125
+ } else {
126
+ extraction_future.await
127
+ };
128
+
129
+ #[cfg(not(feature = "tokio-runtime"))]
130
+ let result = {
131
+ let _ = config.extraction_timeout_secs;
132
+ extraction_future.await
133
+ };
110
134
 
111
135
  #[cfg(feature = "otel")]
112
136
  if let Err(ref e) = result {
data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED
@@ -82,7 +82,7 @@ pub async fn extract_file(
82
82
  );
83
83
  }
84
84
 
85
- let result = async {
85
+ let extraction_future = async {
86
86
  io::validate_file_exists(path)?;
87
87
 
88
88
  if config.force_ocr && config.effective_disable_ocr() {
@@ -119,8 +119,32 @@ pub async fn extract_file(
119
119
  }
120
120
 
121
121
  extract_file_with_extractor(path, &detected_mime, config).await
122
- }
123
- .await;
122
+ };
123
+
124
+ #[cfg(feature = "tokio-runtime")]
125
+ let result = if let Some(secs) = config.extraction_timeout_secs {
126
+ let start = std::time::Instant::now();
127
+ match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
128
+ Ok(inner) => inner,
129
+ Err(_elapsed) => {
130
+ if let Some(ref token) = config.cancel_token {
131
+ token.cancel();
132
+ }
133
+ Err(crate::KreuzbergError::Timeout {
134
+ elapsed_ms: start.elapsed().as_millis() as u64,
135
+ limit_ms: secs * 1000,
136
+ })
137
+ }
138
+ }
139
+ } else {
140
+ extraction_future.await
141
+ };
142
+
143
+ #[cfg(not(feature = "tokio-runtime"))]
144
+ let result = {
145
+ let _ = config.extraction_timeout_secs;
146
+ extraction_future.await
147
+ };
124
148
 
125
149
  #[cfg(feature = "otel")]
126
150
  if let Err(ref e) = result {
data/vendor/kreuzberg/src/core/pipeline/mod.rs CHANGED
@@ -55,6 +55,8 @@ use initialization::{get_processors_from_cache, initialize_features, initialize_
55
55
  )
56
56
  ))]
57
57
  pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
58
+ #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
59
+ let mut doc = doc;
58
60
  // Pre-render markdown for the chunker's heading context resolution when:
59
61
  // - Markdown chunking is configured
60
62
  // - Output format is not already Markdown (which would produce formatted_content anyway)
@@ -100,7 +102,30 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
100
102
  }
101
103
  };
102
104
 
103
- // 1. Derive ExtractionResult from InternalDocument
105
+ // 1. Process extracted images with OCR if configured
106
+ #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
107
+ if config.ocr.is_some() && !doc.images.is_empty() {
108
+ let images_to_process = std::mem::take(&mut doc.images);
109
+ match crate::extraction::image_ocr::process_images_with_ocr(
110
+ images_to_process,
111
+ config,
112
+ &mut doc.processing_warnings,
113
+ )
114
+ .await
115
+ {
116
+ Ok(processed) => {
117
+ doc.images = processed;
118
+ }
119
+ Err(e) => {
120
+ doc.processing_warnings.push(crate::types::ProcessingWarning {
121
+ source: std::borrow::Cow::Borrowed("image_ocr"),
122
+ message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
123
+ });
124
+ }
125
+ }
126
+ }
127
+
128
+ // 2. Derive ExtractionResult from InternalDocument
104
129
  let include_structure = config.include_document_structure;
105
130
  let mut result =
106
131
  crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
@@ -111,25 +136,6 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
111
136
  result.formatted_content = Some(html);
112
137
  }
113
138
 
114
- // 1.5. Process extracted images with OCR if configured
115
- #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
116
- if config.ocr.is_some() && result.images.as_ref().is_some_and(|imgs| !imgs.is_empty()) {
117
- let images_to_process = result.images.take().unwrap_or_default();
118
- match crate::extraction::image_ocr::process_images_with_ocr(images_to_process, config).await {
119
- Ok(processed) => {
120
- result.images = if processed.is_empty() { None } else { Some(processed) };
121
- }
122
- Err(e) => {
123
- result
124
- .processing_warnings
125
- .push(crate::types::extraction::ProcessingWarning {
126
- source: std::borrow::Cow::Borrowed("image_ocr"),
127
- message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
128
- });
129
- }
130
- }
131
- }
132
-
133
139
  // Temporarily store pre-rendered markdown for chunker heading context.
134
140
  // Tracked separately so we can remove it after chunking — apply_output_format
135
141
  // must not swap this into result.content when output_format is Plain.
data/vendor/kreuzberg/src/doc_orientation.rs CHANGED
@@ -229,7 +229,7 @@ pub fn resolve_cache_dir() -> PathBuf {
229
229
  /// Returns `Ok(Some(rotated_bytes))` if rotation was applied,
230
230
  /// `Ok(None)` if no rotation needed (0° or low confidence).
231
231
  pub fn detect_and_rotate(detector: &DocOrientationDetector, image_bytes: &[u8]) -> Result<Option<Vec<u8>>> {
232
- let img = image::load_from_memory(image_bytes)
232
+ let img = crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
233
233
  .map_err(|e| KreuzbergError::Ocr {
234
234
  message: format!("Failed to load image for orientation detection: {e}"),
235
235
  source: None,
data/vendor/kreuzberg/src/extraction/email.rs CHANGED
@@ -256,6 +256,47 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
256
256
  }
257
257
  // Extract HTML from nested message/rfc822 sub-messages.
258
258
  collect_nested_message_html(&message, &mut all_html);
259
+
260
+ // Fallback: if no dedicated HTML body was found, check if the message
261
+ // parts include HTML content. For simple HTML emails, mail-parser might
262
+ // not expose HTML via body_html() but it's still in the parts.
263
+ if all_html.is_empty() {
264
+ use mail_parser::{MimeHeaders, PartType};
265
+ for part in &message.parts {
266
+ if let Some(ct) = part.content_type() {
267
+ let is_html = ct.subtype().map(|s| s.eq_ignore_ascii_case("html")).unwrap_or(false);
268
+ if is_html {
269
+ match &part.body {
270
+ PartType::Text(t) | PartType::Html(t) => {
271
+ all_html.push(t.to_string());
272
+ }
273
+ _ => {}
274
+ }
275
+ }
276
+ }
277
+ }
278
+ }
279
+
280
+ // Final fallback: if still no HTML found, manually extract body from raw bytes.
281
+ // Mail-parser sometimes doesn't parse simple single-part HTML emails correctly.
282
+ if all_html.is_empty()
283
+ && let Ok(data_str) = std::str::from_utf8(&data)
284
+ {
285
+ // Find the blank line that separates headers from body
286
+ // Try both CRLF and LF line endings
287
+ let body = if let Some(pos) = data_str.find("\r\n\r\n") {
288
+ &data_str[pos + 4..]
289
+ } else if let Some(pos) = data_str.find("\n\n") {
290
+ &data_str[pos + 2..]
291
+ } else {
292
+ ""
293
+ };
294
+
295
+ if !body.is_empty() {
296
+ all_html.push(body.to_string());
297
+ }
298
+ }
299
+
259
300
  if all_html.is_empty() {
260
301
  None
261
302
  } else {
@@ -264,11 +305,27 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
264
305
  };
265
306
 
266
307
  let cleaned_text = if let Some(ref plain) = plain_text {
267
- plain.clone()
308
+ // If plain_text contains HTML tags, treat it as HTML
309
+ if plain.contains("<html") || plain.contains("<body") || plain.contains("<!DOCTYPE") {
310
+ clean_html_content(plain)
311
+ } else {
312
+ plain.clone()
313
+ }
268
314
  } else if let Some(html) = &html_content {
269
315
  clean_html_content(html)
270
316
  } else {
271
- String::new()
317
+ // Last resort: if no plain text or extracted HTML, try body_text(0)
318
+ // which might contain HTML content for pure HTML emails
319
+ if let Some(text) = message.body_text(0) {
320
+ // Check if this is actually HTML content
321
+ if text.contains("<html") || text.contains("<body") || text.contains("<!DOCTYPE") {
322
+ clean_html_content(&text)
323
+ } else {
324
+ text.to_string()
325
+ }
326
+ } else {
327
+ String::new()
328
+ }
272
329
  };
273
330
 
274
331
  let mut attachments = Vec::with_capacity(message.attachments().count().min(20));
@@ -1310,7 +1367,18 @@ fn clean_html_content(html: &str) -> String {
1310
1367
  return String::new();
1311
1368
  }
1312
1369
 
1313
- // Use html-to-markdown converter in plain text mode when available
1370
+ // First try: regex-based HTML stripping (most reliable)
1371
+ let cleaned = script_regex().replace_all(html, "");
1372
+ let cleaned = style_regex().replace_all(&cleaned, "");
1373
+ let cleaned = html_tag_regex().replace_all(&cleaned, "");
1374
+ let cleaned = whitespace_regex().replace_all(&cleaned, " ");
1375
+ let text = cleaned.trim().to_string();
1376
+
1377
+ if !text.is_empty() {
1378
+ return text;
1379
+ }
1380
+
1381
+ // Fallback: try html-to-markdown converter if regex stripping produced nothing
1314
1382
  #[cfg(feature = "html")]
1315
1383
  {
1316
1384
  if let Ok(text) = crate::extraction::html::convert_html_to_markdown(
@@ -1325,13 +1393,7 @@ fn clean_html_content(html: &str) -> String {
1325
1393
  }
1326
1394
  }
1327
1395
 
1328
- // Fallback: regex-based HTML stripping
1329
- let cleaned = script_regex().replace_all(html, "");
1330
- let cleaned = style_regex().replace_all(&cleaned, "");
1331
- let cleaned = html_tag_regex().replace_all(&cleaned, "");
1332
- let cleaned = whitespace_regex().replace_all(&cleaned, " ");
1333
-
1334
- cleaned.trim().to_string()
1396
+ String::new()
1335
1397
  }
1336
1398
 
1337
1399
  fn is_image_mime_type(mime_type: &str) -> bool {
data/vendor/kreuzberg/src/extraction/image.rs CHANGED
@@ -342,8 +342,8 @@ pub fn load_image_for_ocr(image_bytes: &[u8]) -> Result<image::DynamicImage> {
342
342
  } else if is_jbig2(image_bytes) {
343
343
  decode_jbig2_to_gray(image_bytes).map(image::DynamicImage::ImageLuma8)
344
344
  } else {
345
- image::load_from_memory(image_bytes)
346
- .map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {}", e)))
345
+ crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
346
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {e}")))
347
347
  }
348
348
  }
349
349
 
data/vendor/kreuzberg/src/extraction/image_ocr.rs CHANGED
@@ -43,6 +43,7 @@ use crate::types::{ExtractedImage, ExtractionResult};
43
43
  pub async fn process_images_with_ocr(
44
44
  mut images: Vec<ExtractedImage>,
45
45
  config: &crate::core::config::ExtractionConfig,
46
+ warnings: &mut Vec<crate::types::ProcessingWarning>,
46
47
  ) -> crate::Result<Vec<ExtractedImage>> {
47
48
  if images.is_empty() || config.ocr.is_none() {
48
49
  return Ok(images);
@@ -125,7 +126,11 @@ pub async fn process_images_with_ocr(
125
126
  };
126
127
  images[idx].ocr_result = Some(Box::new(extraction_result));
127
128
  }
128
- Err(_) => {
129
+ Err(e) => {
130
+ warnings.push(crate::types::ProcessingWarning {
131
+ source: std::borrow::Cow::Borrowed("image_ocr"),
132
+ message: std::borrow::Cow::Owned(format!("Image {} OCR failed: {}", idx, e)),
133
+ });
129
134
  images[idx].ocr_result = None;
130
135
  }
131
136
  }