RubyGems - kreuzberg - Versions diffs - 4.4.0 → 4.4.1 - Mend

kreuzberg 4.4.0 → 4.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/extraction/email.rs +215 -43
data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +39 -7
data/vendor/kreuzberg/src/extractors/rtf/parser.rs +148 -37
data/vendor/kreuzberg/src/ocr/cache.rs +1 -0
data/vendor/kreuzberg/src/ocr/conversion.rs +1 -1
data/vendor/kreuzberg/src/ocr/processor/execution.rs +210 -2
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +44 -10
data/vendor/kreuzberg/src/types/formats.rs +17 -0
data/vendor/kreuzberg/tests/ocr_table_inline.rs +277 -0
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 60064820e029a70308a28ac0f1232b62992511dda4b1f62f5ab9a4c83f3ac8ef
-  data.tar.gz: c564b12ca29c17695be86b2da184be0fc7666be15f2fca2dd26972b85cc93c8c
+  metadata.gz: 56e750209fdd4c61b193bbc25ce1f7e7f3646cee22fcd7ee79af381aa1c95561
+  data.tar.gz: 2b741797f40b209ad5b8451aba4adae514914ee6f8dd86a55b8bbf7d5e910e98
 SHA512:
-  metadata.gz: 162f3915a9e8e4cc51f163e053f284b45f2e228cc0cb1b3f2797a3aafdfe9ddafeb3177c842b5b2d232689d9304e3c9465a80adf93085e39776b76d1719adeed
-  data.tar.gz: 28802dd0a439b8a1d778143a650bae055dd73597c7edbb9db5aa6085e938da529f46216aa4cabf0f583329c9fadf49e5940b6e6376b267eff2baa56c78faff13
+  metadata.gz: 7110c61739f8a373080d03a017ab674713831df05acbb64e6f1c8d8fa6d7ca8e365f3f9cac3a0b47f046b7bd1778e4e01488142d8c1c1de355570363ce710210
+  data.tar.gz: d7a02f18c7e656475bb54081885eb6d82031bd76cc3f5515a68561191ccc5051216157924cfc36555da804afca90e21634780592163bdffad299cd4bc1a5fb0f

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.4.0)
+    kreuzberg (4.4.1)
       rb_sys (~> 0.9.119)
       sorbet-runtime (~> 0.5)
@@ -222,7 +222,7 @@ CHECKSUMS
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
   json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
-  kreuzberg (4.4.0)
+  kreuzberg (4.4.1)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.1" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -45,7 +45,7 @@ collapsible_if = "allow"
 [package]
 name = "kreuzberg-rb"
-version = "4.4.0"
+version = "4.4.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.4.0'
+  VERSION = '4.4.1'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.4.0"
+version = "4.4.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.4.0"
+version = "4.4.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.4.0 Release**
+> **🚀 Version 4.4.1 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/extraction/email.rs CHANGED Viewed

@@ -43,11 +43,11 @@ fn html_tag_regex() -> &'static Regex {
 }
 fn script_regex() -> &'static Regex {
-    SCRIPT_RE.get_or_init(|| Regex::new(r"(?i)<script[^>]*>.*?</script>").unwrap())
+    SCRIPT_RE.get_or_init(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap())
 }
 fn style_regex() -> &'static Regex {
-    STYLE_RE.get_or_init(|| Regex::new(r"(?i)<style[^>]*>.*?</style>").unwrap())
+    STYLE_RE.get_or_init(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap())
 }
 fn whitespace_regex() -> &'static Regex {
@@ -166,16 +166,22 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
         })
         .unwrap_or_else(Vec::new);
-    let date = message
-        .header("Date")
-        .and_then(|hv| {
-            if let mail_parser::HeaderValue::Text(s) = hv {
-                Some(s.trim().to_string())
-            } else {
+    // Extract date: prefer the raw Date header text (preserves original format),
+    // falling back to mail_parser's parsed DateTime → RFC 3339.
+    // mail_parser parses standard RFC 2822 dates into HeaderValue::DateTime,
+    // losing the original string. For non-standard dates (ISO 8601, invalid strings),
+    // it may produce garbled output. We extract the raw header from the email bytes.
+    let date = extract_raw_date_header(&data).or_else(|| {
+        message.date().and_then(|d| {
+            let rfc3339 = d.to_rfc3339();
+            // Reject obviously garbled dates (year 2000, month 0)
+            if rfc3339.starts_with("2000-00") || rfc3339.starts_with("0000-") {
                 None
+            } else {
+                Some(rfc3339)
             }
         })
-        .or_else(|| message.date().map(|d| d.to_rfc3339()));
+    });
     let message_id = message.message_id().map(|id| id.to_string());
@@ -336,25 +342,25 @@ fn extract_msg_from_cfb<F: std::io::Read + std::io::Seek>(
         Some(name) if !name.is_empty() => format!("\"{}\" <{}>", name, email),
         _ => email.clone(),
     });
-    let display_to = read_msg_string_prop(comp, "", 0x0E04); // PR_DISPLAY_TO
-    let display_cc = read_msg_string_prop(comp, "", 0x0E03); // PR_DISPLAY_CC
-    let display_bcc = read_msg_string_prop(comp, "", 0x0E02); // PR_DISPLAY_BCC
     let body = read_msg_string_prop(comp, "", 0x1000); // PR_BODY
     let html_body = read_msg_string_prop(comp, "", 0x1013); // PR_BODY_HTML
     let message_id = read_msg_string_prop(comp, "", 0x1035) // PR_INTERNET_MESSAGE_ID
         .filter(|s| !s.is_empty());
-    let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
-    // Parse date from transport headers (e.g. "Date: Mon, 1 Jan 2024 …").
-    let date = headers.as_ref().and_then(|h| {
-        h.lines()
-            .find(|line| line.starts_with("Date:"))
-            .map(|line| line.trim_start_matches("Date:").trim().to_string())
-    });
+    // --- date: prefer PR_CLIENT_SUBMIT_TIME, fall back to transport headers ---
+    let date = read_msg_filetime_prop(comp, "", 0x0039) // PR_CLIENT_SUBMIT_TIME
+        .or_else(|| read_msg_filetime_prop(comp, "", 0x0E06)) // PR_MESSAGE_DELIVERY_TIME
+        .or_else(|| {
+            let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
+            headers.as_ref().and_then(|h| {
+                h.lines()
+                    .find(|line| line.starts_with("Date:"))
+                    .map(|line| line.trim_start_matches("Date:").trim().to_string())
+            })
+        });
-    let to_emails = split_display_addresses(&display_to);
-    let cc_emails = split_display_addresses(&display_cc);
-    let bcc_emails = split_display_addresses(&display_bcc);
+    // --- recipients: read from substorages for full email addresses -----------
+    let (to_emails, cc_emails, bcc_emails) = read_msg_recipients(comp);
     let plain_text = body.filter(|s| !s.is_empty());
     let html_content = html_body.filter(|s| !s.is_empty());
@@ -488,15 +494,188 @@ fn decode_utf16le_bytes(data: &[u8]) -> String {
     String::from_utf16_lossy(&u16s).trim_end_matches('\0').to_string()
 }
-/// Split semicolon/comma-separated display addresses into individual strings.
-fn split_display_addresses(display: &Option<String>) -> Vec<String> {
-    display
-        .as_deref()
-        .unwrap_or("")
-        .split([';', ','])
-        .map(|s| s.trim().to_string())
-        .filter(|s| !s.is_empty())
-        .collect()
+/// Read a PT_SYSTIME (FILETIME) property from the __properties_version1.0 stream
+/// and convert it to an ISO 8601 date string; returns `None` when the stream cannot be opened or read, when no PT_SYSTIME entry with `prop_id` exists, or when the conversion itself yields `None`.
+///
+/// FILETIME is a 64-bit value representing 100-nanosecond intervals since 1601-01-01.
+fn read_msg_filetime_prop<F: std::io::Read + std::io::Seek>(
+    comp: &mut cfb::CompoundFile<F>,
+    base: &str,
+    prop_id: u16,
+) -> Option<String> {
+    use std::io::Read; // needed for `read_to_end` on the stream
+    let props_path = format!("{base}/__properties_version1.0");
+    let mut stream = comp.open_stream(&props_path).ok()?;
+    let mut buf = Vec::new();
+    stream.read_to_end(&mut buf).ok()?;
+    // Message-level properties have a 32-byte header; recipient/attachment have 8-byte. An empty `base` is treated as the message level.
+    let header_size: usize = if base.is_empty() { 32 } else { 8 };
+    let mut offset = header_size;
+    while offset + 16 <= buf.len() { // only look at complete 16-byte entries; a trailing partial entry is ignored
+        // MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8), all read as little-endian
+        let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
+        let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
+        if pid == prop_id && ptype == 0x0040 {
+            // PT_SYSTIME: the 8-byte value field holds the FILETIME; the first matching entry wins
+            let filetime = u64::from_le_bytes(buf[offset + 8..offset + 16].try_into().ok()?);
+            return filetime_to_iso8601(filetime);
+        }
+        offset += 16; // advance to the next fixed-size entry
+    }
+    None // no PT_SYSTIME entry with this id was found
+}
+/// Convert a Windows FILETIME (100-ns intervals since 1601-01-01) to an ISO 8601 string with a fixed `+00:00` offset; returns `None` for values earlier than 1970-01-01.
+fn filetime_to_iso8601(filetime: u64) -> Option<String> {
+    // Epoch offset: difference between 1601-01-01 and 1970-01-01 in 100-ns intervals
+    const EPOCH_DIFF: u64 = 116_444_736_000_000_000;
+    if filetime < EPOCH_DIFF {
+        return None; // pre-1970 timestamps are rejected rather than formatted
+    }
+    let hundred_ns = filetime - EPOCH_DIFF;
+    let secs = (hundred_ns / 10_000_000) as i64; // whole seconds since 1970-01-01
+    let nanos = ((hundred_ns % 10_000_000) * 100) as u32; // remainder is < 10_000_000, so the product stays below 1_000_000_000 and fits in u32
+    // Format manually to avoid pulling in chrono
+    let days_since_epoch = secs / 86400;
+    let time_of_day = secs % 86400;
+    let (hour, min, sec) = (time_of_day / 3600, (time_of_day % 3600) / 60, time_of_day % 60);
+    // Civil date calculation from days since 1970-01-01 (algorithm from Howard Hinnant)
+    let z = days_since_epoch + 719468;
+    let era = (if z >= 0 { z } else { z - 146096 }) / 146097; // `z` is never negative here because pre-1970 inputs returned above
+    let doe = z - era * 146097;
+    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365;
+    let y = yoe + era * 400;
+    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
+    let mp = (5 * doy + 2) / 153;
+    let d = doy - (153 * mp + 2) / 5 + 1;
+    let m = if mp < 10 { mp + 3 } else { mp - 9 };
+    let y = if m <= 2 { y + 1 } else { y };
+    if nanos == 0 {
+        Some(format!("{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}+00:00"))
+    } else {
+        // Include sub-second precision, truncated (not rounded) to milliseconds; a non-zero remainder below 1 ms is printed as `.000`
+        let frac = nanos / 1_000_000; // milliseconds
+        Some(format!(
+            "{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}.{frac:03}+00:00"
+        ))
+    }
+}
+/// Read recipients from MSG __recip_version1.0_#XXXXXXXX substorages.
+/// Recipients with neither an email address nor a non-empty display name are skipped; a recipient whose type is not 1, 2 or 3 is filed under To.
+/// Returns (to, cc, bcc) vectors. Each entry is formatted as `"Name" <email>`, just `email`, or just the display name when no address is available.
+fn read_msg_recipients<F: std::io::Read + std::io::Seek>(
+    comp: &mut cfb::CompoundFile<F>,
+) -> (Vec<String>, Vec<String>, Vec<String>) {
+    // Collect recipient storage paths before reading any properties. NOTE(review): the filter checks only the entry name, so if `walk()` descends into nested storages, recipients of embedded messages would be picked up too — confirm.
+    let recip_paths: Vec<String> = comp
+        .walk()
+        .filter(|e| e.is_storage() && e.name().starts_with("__recip_version1.0_"))
+        .map(|e| e.path().to_string_lossy().into_owned())
+        .collect();
+    let mut to_emails = Vec::new();
+    let mut cc_emails = Vec::new();
+    let mut bcc_emails = Vec::new();
+    for path in &recip_paths {
+        let display_name = read_msg_string_prop(comp, path, 0x3001); // PR_DISPLAY_NAME
+        let email_addr = read_msg_string_prop(comp, path, 0x39FE) // PR_SMTP_ADDRESS
+            .or_else(|| read_msg_string_prop(comp, path, 0x3003)) // PR_EMAIL_ADDRESS
+            .filter(|s| !s.is_empty()); // an empty address counts as missing; note an empty (but present) 0x39FE value does not trigger the 0x3003 fallback
+        let formatted = match (&display_name, &email_addr) {
+            (Some(name), Some(email)) if !name.is_empty() && name != email => { // show the name only when it adds something beyond the address
+                format!("\"{}\" <{}>", name, email)
+            }
+            (_, Some(email)) => email.clone(),
+            (Some(name), None) if !name.is_empty() => name.clone(),
+            _ => continue, // nothing usable for this recipient
+        };
+        // Read PR_RECIPIENT_TYPE from properties stream (0 is returned when it cannot be found)
+        let recip_type = read_msg_recip_type(comp, path);
+        match recip_type {
+            1 => to_emails.push(formatted),  // MAPI_TO
+            2 => cc_emails.push(formatted),  // MAPI_CC
+            3 => bcc_emails.push(formatted), // MAPI_BCC
+            _ => to_emails.push(formatted),  // Default to To
+        }
+    }
+    (to_emails, cc_emails, bcc_emails)
+}
+/// Read PR_RECIPIENT_TYPE (0x0C15) from a recipient's __properties_version1.0 stream.
+/// Returns the raw PT_LONG value unmodified (expected: 1 = To, 2 = CC, 3 = BCC), or 0 if the stream cannot be opened or read or no such entry exists.
+fn read_msg_recip_type<F: std::io::Read + std::io::Seek>(comp: &mut cfb::CompoundFile<F>, base: &str) -> u32 {
+    use std::io::Read; // needed for `read_to_end` on the stream
+    let props_path = format!("{base}/__properties_version1.0");
+    let mut stream = match comp.open_stream(&props_path) {
+        Ok(s) => s,
+        Err(_) => return 0, // missing/unopenable stream is reported as "not found"
+    };
+    let mut buf = Vec::new();
+    if stream.read_to_end(&mut buf).is_err() {
+        return 0;
+    }
+    // Recipient properties have 8-byte header
+    let mut offset = 8;
+    while offset + 16 <= buf.len() { // only complete 16-byte entries are examined
+        // MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8), all read as little-endian
+        let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
+        let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
+        if pid == 0x0C15 && ptype == 0x0003 {
+            // PT_LONG: only the first 4 bytes of the 8-byte value field are used
+            return u32::from_le_bytes([buf[offset + 8], buf[offset + 9], buf[offset + 10], buf[offset + 11]]);
+        }
+        offset += 16; // advance to the next fixed-size entry
+    }
+    0 // property not present
+}
+/// Extract the raw Date header value from email bytes; returns `None` when the input is not valid UTF-8 or no non-empty Date value is found.
+///
+/// Scans for `Date:` in the header section (before the blank line that separates
+/// headers from body) and returns the raw value, handling continuation lines. Only the spellings `Date:` and `date:` at the start of a line are recognized.
+fn extract_raw_date_header(data: &[u8]) -> Option<String> {
+    let text = std::str::from_utf8(data).ok()?; // the whole input must be valid UTF-8, otherwise None
+    // Find the end of headers (blank line): the first CRLF CRLF if any, otherwise the first LF LF
+    let header_end = text
+        .find("\r\n\r\n")
+        .or_else(|| text.find("\n\n"))
+        .unwrap_or(text.len().min(8192)); // Cap scan to 8KB — applies only when no blank line exists. NOTE(review): for longer inputs byte 8192 may fall inside a multi-byte character, in which case the slice below panics — confirm and guard.
+    let headers = &text[..header_end];
+    // Find the Date: header (not fully case-insensitive: only the exact prefixes "Date:" and "date:" match)
+    let mut date_value = None;
+    for line in headers.lines() {
+        if let Some(val) = line.strip_prefix("Date:").or_else(|| line.strip_prefix("date:")) {
+            date_value = Some(val.trim().to_string());
+        } else if date_value.is_some() && (line.starts_with(' ') || line.starts_with('\t')) {
+            // Continuation line (folded header): appended to the value, separated by a single space
+            if let Some(ref mut dv) = date_value {
+                dv.push(' ');
+                dv.push_str(line.trim());
+            }
+        } else if date_value.is_some() {
+            break; // Next header field
+        }
+    }
+    date_value.filter(|s| !s.is_empty()) // an empty Date value is treated as absent
 }
 /// Extract email content from either .eml or .msg format
@@ -545,17 +724,8 @@ pub fn build_email_text_output(result: &EmailExtractionResult) -> String {
     text_parts.push(result.cleaned_text.clone());
-    if !result.attachments.is_empty() {
-        let mut attachment_names = Vec::with_capacity(result.attachments.len().min(20));
-        for att in &result.attachments {
-            if let Some(name) = att.name.as_ref().or(att.filename.as_ref()) {
-                attachment_names.push(name.clone());
-            }
-        }
-        if !attachment_names.is_empty() {
-            text_parts.push(format!("Attachments: {}", attachment_names.join(", ")));
-        }
-    }
+    // Attachment names are stored in metadata but not included in the text output.
+    // This keeps the text output focused on message content.
     text_parts.join("\n")
 }
@@ -790,7 +960,9 @@ mod tests {
         };
         let output = build_email_text_output(&result);
-        assert!(output.contains("Attachments: file.txt"));
+        // Attachment names are stored in metadata, not in text output
+        assert!(!output.contains("Attachments:"));
+        assert!(output.contains("Hello World"));
     }
     #[test]

data/vendor/kreuzberg/src/extractors/rtf/formatting.rs CHANGED Viewed

@@ -1,15 +1,47 @@
 //! Text formatting utilities for RTF content.
-/// Normalize whitespace in a string using a single-pass algorithm.
+/// Normalize whitespace in a string.
 ///
-/// Collapses multiple consecutive whitespace characters into single spaces
-/// and trims leading/trailing whitespace.
+/// - Collapses multiple consecutive spaces/tabs into a single space
+/// - Preserves single newlines (paragraph breaks from \par)
+/// - Collapses multiple consecutive newlines into a double newline
+/// - Trims leading/trailing whitespace from each line
+/// - Trims leading/trailing blank lines
 pub fn normalize_whitespace(s: &str) -> String {
-    let mut result = String::with_capacity(s.len());
-    let mut last_was_space = false;
+    // Split into lines, trim each, collapse blank runs
+    let mut lines: Vec<&str> = Vec::new();
+    let mut last_blank = false;
+    for line in s.split('\n') {
+        // Collapse internal whitespace on each line
+        let trimmed = line.trim();
+        if trimmed.is_empty() {
+            if !last_blank && !lines.is_empty() {
+                lines.push("");
+                last_blank = true;
+            }
+        } else {
+            last_blank = false;
+            lines.push(trimmed);
+        }
+    }
-    for ch in s.chars() {
-        if ch.is_whitespace() {
+    // Trim trailing blank lines
+    while lines.last() == Some(&"") {
+        lines.pop();
+    }
+    // Join and collapse internal multi-spaces within each line
+    let joined = lines.join("\n");
+    // Collapse runs of spaces within lines
+    let mut result = String::with_capacity(joined.len());
+    let mut last_was_space = false;
+    for ch in joined.chars() {
+        if ch == '\n' {
+            result.push('\n');
+            last_was_space = false;
+        } else if ch == ' ' || ch == '\t' {
             if !last_was_space {
                 result.push(' ');
                 last_was_space = true;