RubyGems - kreuzberg - Versions diffs - 4.9.4 → 4.9.6 - Mend

kreuzberg 4.9.4 → 4.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

checksums.yaml +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
data/ext/kreuzberg_rb/native/src/config/types.rs +1 -0
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +5 -5
data/vendor/kreuzberg/Cargo.toml +3 -3
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/config/ocr.rs +8 -0
data/vendor/kreuzberg/src/core/config/processing.rs +72 -14
data/vendor/kreuzberg/src/core/extractor/bytes.rs +27 -3
data/vendor/kreuzberg/src/core/extractor/file.rs +27 -3
data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -20
data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
data/vendor/kreuzberg/src/extraction/email.rs +72 -10
data/vendor/kreuzberg/src/extraction/image.rs +2 -2
data/vendor/kreuzberg/src/extraction/image_ocr.rs +6 -1
data/vendor/kreuzberg/src/extraction/transform/content.rs +249 -4
data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -5
data/vendor/kreuzberg/src/extractors/email.rs +12 -11
data/vendor/kreuzberg/src/extractors/hwp.rs +18 -5
data/vendor/kreuzberg/src/extractors/image.rs +11 -6
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +16 -2
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +46 -16
data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +26 -8
data/vendor/kreuzberg/src/mcp/params.rs +17 -1
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +1 -0
data/vendor/kreuzberg/src/ocr/types.rs +11 -1
data/vendor/kreuzberg/src/ort_discovery.rs +74 -22
data/vendor/kreuzberg/src/paddle_ocr/backend.rs +108 -10
data/vendor/kreuzberg/src/pdf/images.rs +134 -8
data/vendor/kreuzberg/src/pdf/structure/bridge.rs +4 -4
data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +7 -3
data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +9 -0
data/vendor/kreuzberg/src/rendering/djot.rs +8 -0
data/vendor/kreuzberg/src/rendering/markdown.rs +7 -0
data/vendor/kreuzberg/src/rendering/plain.rs +16 -7
data/vendor/kreuzberg/src/types/formats.rs +6 -2
data/vendor/kreuzberg/src/utils/image_decode.rs +99 -0
data/vendor/kreuzberg/src/utils/mod.rs +8 -0
data/vendor/kreuzberg/tests/docx_ocr_integration_test.rs +84 -0
data/vendor/kreuzberg/tests/email_integration.rs +18 -7
data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +92 -0
data/vendor/kreuzberg/tests/gpu_acceleration.rs +419 -0
data/vendor/kreuzberg/tests/issue_797_preset_embedding_regression.rs +75 -0
data/vendor/kreuzberg/tests/markdown_lint_quality.rs +18 -6
data/vendor/kreuzberg/tests/mcp_integration.rs +13 -5
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +16 -20
data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +129 -0
data/vendor/kreuzberg/tests/test_batch_extract_schema.rs +56 -0
data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/build.rs +5 -0
metadata +8 -3
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6921

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3d8a203168595f6b316a165f500818abed75d89c7a82c46b5b20df996a4bb841
-  data.tar.gz: 28fd19fecd9b18597f17a783923ec3ec08cfa7b99612fec1ca8790aa5cdddbdc
+  metadata.gz: 9f3132b44aad1652c76e8b1445b775eb3586e48661908eda794c95339f06387d
+  data.tar.gz: 2f957af07040ec2f3bcd79c299dd429a752423d714eea73bfb608a28718a6c11
 SHA512:
-  metadata.gz: af522bff519c1082396d9a6a9480a088693791a4f50818fd1d233726082675559a3144f7758b0ee217b18cdbb1cd08236ecbb332f68c4186a26aa69b83454392
-  data.tar.gz: 4109e6dbc32c5fed518ba84940a7bc732553a178d3b67b69dad6eff5b998aad12b0a53cfe6c6d0784848cbb484e7b79a7abe9154f23d2100786f38414d8286c0
+  metadata.gz: 878748ecb791e049c2de05cdc4ec7b9f6749bb265981c98ea49126108ca7c2782b92a6b5ed31d1fbfbeee83e3c45c80aaf74aacecd20f9bc428d796709afa0aa
+  data.tar.gz: ff137eb78f8fcfcc2ac357b0d9adf6d3d6fee11a448679a976678e0745905a0abcd8abfeb331028d780810d96dc47a04ec01dda94c900ec63ad4b35c124c187f

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.6" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-rb"
-version = "4.9.4"
+version = "4.9.6"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
     "time",
     "io-util",
 ] }
-html-to-markdown-rs = { version = "3.2.6", default-features = false }
+html-to-markdown-rs = { version = "3.3.1", default-features = false }
 [dev-dependencies]
 pretty_assertions = "1.4"

data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED Viewed

@@ -54,6 +54,7 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
         quality_thresholds: None,
         vlm_config: None,
         vlm_prompt: None,
+        acceleration: None,
     };
     if let Some(val) = get_kw(ruby, hash, "tesseract_config")

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.9.4'
+  VERSION = '4.9.6'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.9.4"
+version = "4.9.6"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,12 +28,12 @@ dbase = "0.7"
 futures = "0.3"
 getrandom = { version = "0.4.2", features = ["wasm_js"] }
 hex = "0.4.3"
-html-to-markdown-rs = { version = "3.2.6", default-features = false }
+html-to-markdown-rs = { version = "3.3.1", default-features = false }
 image = { version = "0.25.10", default-features = false }
 itertools = "0.14"
 js-sys = "0.3"
-kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
-kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
+kreuzberg = { path = "./crates/kreuzberg", version = "4.9.6", default-features = false }
+kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.6" }
 lazy_static = "1.5.0"
 libc = "0.2.185"
 liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
 tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
 toml = "1.1.2"
 tracing = "0.1"
-tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false }
+tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
 wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
 wasm-bindgen-futures = "0.4"
 web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.9.4"
+version = "4.9.6"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
     "simd",
 ], optional = true }
 hex = "0.4.3"
-html-to-markdown-rs = { version = "3.2.6", default-features = false, features = [
+html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
     "inline-images",
     "metadata",
 ], optional = true }
@@ -392,7 +392,7 @@ optional = true
 # Override getrandom to enable js feature for WASM targets
 # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
 getrandom = { version = "0.4.2", features = ["wasm_js"] }
-tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false, optional = true }
+tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
 wasm-bindgen-rayon = { version = "1.3", optional = true }
 [build-dependencies]

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.9.4 Release**
+> **🚀 Version 4.9.6 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/core/config/ocr.rs CHANGED Viewed

@@ -271,6 +271,13 @@ pub struct OcrConfig {
     /// - `{{ language }}` — The document language code (e.g., "eng", "deu").
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub vlm_prompt: Option<String>,
+    /// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
+    ///
+    /// Not user-configurable via config files — injected at runtime from
+    /// `ExtractionConfig::acceleration` before each `process_image` call.
+    #[serde(skip)]
+    pub acceleration: Option<super::acceleration::AccelerationConfig>,
 }
 impl Default for OcrConfig {
@@ -288,6 +295,7 @@ impl Default for OcrConfig {
             auto_rotate: false,
             vlm_config: None,
             vlm_prompt: None,
+            acceleration: None,
         }
     }
 }

data/vendor/kreuzberg/src/core/config/processing.rs CHANGED Viewed

@@ -267,15 +267,10 @@ impl ChunkingConfig {
             }
         };
-        let embedding = match &self.embedding {
-            Some(existing) => Some(existing.clone()),
-            None => Some(EmbeddingConfig {
-                model: EmbeddingModelType::Preset {
-                    name: preset_name.clone(),
-                },
-                ..EmbeddingConfig::default()
-            }),
-        };
+        // Preserve the caller's embedding choice, including None.
+        // Presets configure chunking parameters only; users must explicitly
+        // provide an EmbeddingConfig to opt into embedding generation.
+        let embedding = self.embedding.clone();
         Self {
             max_characters: preset.chunk_size,
@@ -568,11 +563,9 @@ mod tests {
         let resolved = config.resolve_preset();
         assert_eq!(resolved.max_characters, 1024);
         assert_eq!(resolved.overlap, 100);
-        assert!(resolved.embedding.is_some());
-        match &resolved.embedding.unwrap().model {
-            EmbeddingModelType::Preset { name } => assert_eq!(name, "balanced"),
-            _ => panic!("Expected Preset model type"),
-        }
+        // Preset configures chunking parameters only; embedding stays None unless
+        // the caller explicitly provided one (#797).
+        assert!(resolved.embedding.is_none());
     }
     #[test]
@@ -686,4 +679,69 @@ mod tests {
             _ => panic!("Expected Custom variant"),
         }
     }
+    // --- Issue #797 regression tests ---
+    /// Preset with no explicit embedding: embedding must remain None.
+    ///
+    /// Before the fix, `resolve_preset()` would silently inject an
+    /// `EmbeddingConfig` whenever a preset was configured, causing every
+    /// chunk to have an unexpected `.embedding` field populated.
+    #[test]
+    #[cfg(feature = "embeddings")]
+    fn test_resolve_preset_does_not_inject_embedding_when_none() {
+        let config = ChunkingConfig {
+            preset: Some("multilingual".to_string()),
+            embedding: None,
+            ..Default::default()
+        };
+        let resolved = config.resolve_preset();
+        assert!(
+            resolved.embedding.is_none(),
+            "preset alone must not inject an EmbeddingConfig (#797)"
+        );
+    }
+    /// Preset with an explicit embedding: the embedding must be preserved unchanged.
+    #[test]
+    #[cfg(feature = "embeddings")]
+    fn test_resolve_preset_preserves_explicit_embedding_config() {
+        let explicit = EmbeddingConfig {
+            model: EmbeddingModelType::Custom {
+                model_id: "my-org/model".to_string(),
+                dimensions: 768,
+            },
+            batch_size: 16,
+            ..Default::default()
+        };
+        let config = ChunkingConfig {
+            preset: Some("multilingual".to_string()),
+            embedding: Some(explicit),
+            ..Default::default()
+        };
+        let resolved = config.resolve_preset();
+        let emb = resolved
+            .embedding
+            .expect("explicit embedding must survive resolve_preset");
+        assert_eq!(emb.batch_size, 16);
+        match emb.model {
+            EmbeddingModelType::Custom { model_id, dimensions } => {
+                assert_eq!(model_id, "my-org/model");
+                assert_eq!(dimensions, 768);
+            }
+            other => panic!("expected Custom model type, got {other:?}"),
+        }
+    }
+    /// No preset, no embedding: embedding must stay None (regression guard).
+    #[test]
+    fn test_resolve_preset_no_preset_no_embedding_stays_none() {
+        let config = ChunkingConfig {
+            preset: None,
+            embedding: None,
+            ..Default::default()
+        };
+        let resolved = config.resolve_preset();
+        assert!(resolved.embedding.is_none(), "no-preset path must not touch embedding");
+    }
 }

data/vendor/kreuzberg/src/core/extractor/bytes.rs CHANGED Viewed

@@ -66,7 +66,7 @@ use super::file::extract_bytes_with_extractor;
 pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
     use crate::core::mime;
-    let result = async {
+    let extraction_future = async {
         if config.force_ocr && config.effective_disable_ocr() {
             return Err(crate::KreuzbergError::Validation {
                 message: "force_ocr and disable_ocr cannot both be true".to_string(),
@@ -105,8 +105,32 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
         }
         extract_bytes_with_extractor(content, &validated_mime, config).await
-    }
-    .await;
+    };
+    #[cfg(feature = "tokio-runtime")]
+    let result = if let Some(secs) = config.extraction_timeout_secs {
+        let start = std::time::Instant::now();
+        match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
+            Ok(inner) => inner,
+            Err(_elapsed) => {
+                if let Some(ref token) = config.cancel_token {
+                    token.cancel();
+                }
+                Err(crate::KreuzbergError::Timeout {
+                    elapsed_ms: start.elapsed().as_millis() as u64,
+                    limit_ms: secs * 1000,
+                })
+            }
+        }
+    } else {
+        extraction_future.await
+    };
+    #[cfg(not(feature = "tokio-runtime"))]
+    let result = {
+        let _ = config.extraction_timeout_secs;
+        extraction_future.await
+    };
     #[cfg(feature = "otel")]
     if let Err(ref e) = result {

data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED Viewed

@@ -82,7 +82,7 @@ pub async fn extract_file(
         );
     }
-    let result = async {
+    let extraction_future = async {
         io::validate_file_exists(path)?;
         if config.force_ocr && config.effective_disable_ocr() {
@@ -119,8 +119,32 @@ pub async fn extract_file(
         }
         extract_file_with_extractor(path, &detected_mime, config).await
-    }
-    .await;
+    };
+    #[cfg(feature = "tokio-runtime")]
+    let result = if let Some(secs) = config.extraction_timeout_secs {
+        let start = std::time::Instant::now();
+        match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
+            Ok(inner) => inner,
+            Err(_elapsed) => {
+                if let Some(ref token) = config.cancel_token {
+                    token.cancel();
+                }
+                Err(crate::KreuzbergError::Timeout {
+                    elapsed_ms: start.elapsed().as_millis() as u64,
+                    limit_ms: secs * 1000,
+                })
+            }
+        }
+    } else {
+        extraction_future.await
+    };
+    #[cfg(not(feature = "tokio-runtime"))]
+    let result = {
+        let _ = config.extraction_timeout_secs;
+        extraction_future.await
+    };
     #[cfg(feature = "otel")]
     if let Err(ref e) = result {

data/vendor/kreuzberg/src/core/pipeline/mod.rs CHANGED Viewed

@@ -55,6 +55,8 @@ use initialization::{get_processors_from_cache, initialize_features, initialize_
     )
 ))]
 pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
+    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
+    let mut doc = doc;
     // Pre-render markdown for the chunker's heading context resolution when:
     // - Markdown chunking is configured
     // - Output format is not already Markdown (which would produce formatted_content anyway)
@@ -100,7 +102,30 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
         }
     };
-    // 1. Derive ExtractionResult from InternalDocument
+    // 1. Process extracted images with OCR if configured
+    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
+    if config.ocr.is_some() && !doc.images.is_empty() {
+        let images_to_process = std::mem::take(&mut doc.images);
+        match crate::extraction::image_ocr::process_images_with_ocr(
+            images_to_process,
+            config,
+            &mut doc.processing_warnings,
+        )
+        .await
+        {
+            Ok(processed) => {
+                doc.images = processed;
+            }
+            Err(e) => {
+                doc.processing_warnings.push(crate::types::ProcessingWarning {
+                    source: std::borrow::Cow::Borrowed("image_ocr"),
+                    message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
+                });
+            }
+        }
+    }
+    // 2. Derive ExtractionResult from InternalDocument
     let include_structure = config.include_document_structure;
     let mut result =
         crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
@@ -111,25 +136,6 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
         result.formatted_content = Some(html);
     }
-    // 1.5. Process extracted images with OCR if configured
-    #[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
-    if config.ocr.is_some() && result.images.as_ref().is_some_and(|imgs| !imgs.is_empty()) {
-        let images_to_process = result.images.take().unwrap_or_default();
-        match crate::extraction::image_ocr::process_images_with_ocr(images_to_process, config).await {
-            Ok(processed) => {
-                result.images = if processed.is_empty() { None } else { Some(processed) };
-            }
-            Err(e) => {
-                result
-                    .processing_warnings
-                    .push(crate::types::extraction::ProcessingWarning {
-                        source: std::borrow::Cow::Borrowed("image_ocr"),
-                        message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
-                    });
-            }
-        }
-    }
     // Temporarily store pre-rendered markdown for chunker heading context.
     // Tracked separately so we can remove it after chunking — apply_output_format
     // must not swap this into result.content when output_format is Plain.

data/vendor/kreuzberg/src/doc_orientation.rs CHANGED Viewed

@@ -229,7 +229,7 @@ pub fn resolve_cache_dir() -> PathBuf {
 /// Returns `Ok(Some(rotated_bytes))` if rotation was applied,
 /// `Ok(None)` if no rotation needed (0° or low confidence).
 pub fn detect_and_rotate(detector: &DocOrientationDetector, image_bytes: &[u8]) -> Result<Option<Vec<u8>>> {
-    let img = image::load_from_memory(image_bytes)
+    let img = crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
         .map_err(|e| KreuzbergError::Ocr {
             message: format!("Failed to load image for orientation detection: {e}"),
             source: None,

data/vendor/kreuzberg/src/extraction/email.rs CHANGED Viewed

@@ -256,6 +256,47 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
         }
         // Extract HTML from nested message/rfc822 sub-messages.
         collect_nested_message_html(&message, &mut all_html);
+        // Fallback: if no dedicated HTML body was found, check if the message
+        // parts include HTML content. For simple HTML emails, mail-parser might
+        // not expose HTML via body_html() but it's still in the parts.
+        if all_html.is_empty() {
+            use mail_parser::{MimeHeaders, PartType};
+            for part in &message.parts {
+                if let Some(ct) = part.content_type() {
+                    let is_html = ct.subtype().map(|s| s.eq_ignore_ascii_case("html")).unwrap_or(false);
+                    if is_html {
+                        match &part.body {
+                            PartType::Text(t) | PartType::Html(t) => {
+                                all_html.push(t.to_string());
+                            }
+                            _ => {}
+                        }
+                    }
+                }
+            }
+        }
+        // Final fallback: if still no HTML found, manually extract body from raw bytes.
+        // Mail-parser sometimes doesn't parse simple single-part HTML emails correctly.
+        if all_html.is_empty()
+            && let Ok(data_str) = std::str::from_utf8(&data)
+        {
+            // Find the blank line that separates headers from body
+            // Try both CRLF and LF line endings
+            let body = if let Some(pos) = data_str.find("\r\n\r\n") {
+                &data_str[pos + 4..]
+            } else if let Some(pos) = data_str.find("\n\n") {
+                &data_str[pos + 2..]
+            } else {
+                ""
+            };
+            if !body.is_empty() {
+                all_html.push(body.to_string());
+            }
+        }
         if all_html.is_empty() {
             None
         } else {
@@ -264,11 +305,27 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
     };
     let cleaned_text = if let Some(ref plain) = plain_text {
-        plain.clone()
+        // If plain_text contains HTML tags, treat it as HTML
+        if plain.contains("<html") || plain.contains("<body") || plain.contains("<!DOCTYPE") {
+            clean_html_content(plain)
+        } else {
+            plain.clone()
+        }
     } else if let Some(html) = &html_content {
         clean_html_content(html)
     } else {
-        String::new()
+        // Last resort: if no plain text or extracted HTML, try body_text(0)
+        // which might contain HTML content for pure HTML emails
+        if let Some(text) = message.body_text(0) {
+            // Check if this is actually HTML content
+            if text.contains("<html") || text.contains("<body") || text.contains("<!DOCTYPE") {
+                clean_html_content(&text)
+            } else {
+                text.to_string()
+            }
+        } else {
+            String::new()
+        }
     };
     let mut attachments = Vec::with_capacity(message.attachments().count().min(20));
@@ -1310,7 +1367,18 @@ fn clean_html_content(html: &str) -> String {
         return String::new();
     }
-    // Use html-to-markdown converter in plain text mode when available
+    // First try: regex-based HTML stripping (most reliable)
+    let cleaned = script_regex().replace_all(html, "");
+    let cleaned = style_regex().replace_all(&cleaned, "");
+    let cleaned = html_tag_regex().replace_all(&cleaned, "");
+    let cleaned = whitespace_regex().replace_all(&cleaned, " ");
+    let text = cleaned.trim().to_string();
+    if !text.is_empty() {
+        return text;
+    }
+    // Fallback: try html-to-markdown converter if regex stripping produced nothing
     #[cfg(feature = "html")]
     {
         if let Ok(text) = crate::extraction::html::convert_html_to_markdown(
@@ -1325,13 +1393,7 @@ fn clean_html_content(html: &str) -> String {
         }
     }
-    // Fallback: regex-based HTML stripping
-    let cleaned = script_regex().replace_all(html, "");
-    let cleaned = style_regex().replace_all(&cleaned, "");
-    let cleaned = html_tag_regex().replace_all(&cleaned, "");
-    let cleaned = whitespace_regex().replace_all(&cleaned, " ");
-    cleaned.trim().to_string()
+    String::new()
 }
 fn is_image_mime_type(mime_type: &str) -> bool {

data/vendor/kreuzberg/src/extraction/image.rs CHANGED Viewed

@@ -342,8 +342,8 @@ pub fn load_image_for_ocr(image_bytes: &[u8]) -> Result<image::DynamicImage> {
     } else if is_jbig2(image_bytes) {
         decode_jbig2_to_gray(image_bytes).map(image::DynamicImage::ImageLuma8)
     } else {
-        image::load_from_memory(image_bytes)
-            .map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {}", e)))
+        crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {e}")))
     }
 }

data/vendor/kreuzberg/src/extraction/image_ocr.rs CHANGED Viewed

@@ -43,6 +43,7 @@ use crate::types::{ExtractedImage, ExtractionResult};
 pub async fn process_images_with_ocr(
     mut images: Vec<ExtractedImage>,
     config: &crate::core::config::ExtractionConfig,
+    warnings: &mut Vec<crate::types::ProcessingWarning>,
 ) -> crate::Result<Vec<ExtractedImage>> {
     if images.is_empty() || config.ocr.is_none() {
         return Ok(images);
@@ -125,7 +126,11 @@ pub async fn process_images_with_ocr(
                 };
                 images[idx].ocr_result = Some(Box::new(extraction_result));
             }
-            Err(_) => {
+            Err(e) => {
+                warnings.push(crate::types::ProcessingWarning {
+                    source: std::borrow::Cow::Borrowed("image_ocr"),
+                    message: std::borrow::Cow::Owned(format!("Image {} OCR failed: {}", idx, e)),
+                });
                 images[idx].ocr_result = None;
             }
         }