kreuzberg 4.4.0 → 4.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60064820e029a70308a28ac0f1232b62992511dda4b1f62f5ab9a4c83f3ac8ef
4
- data.tar.gz: c564b12ca29c17695be86b2da184be0fc7666be15f2fca2dd26972b85cc93c8c
3
+ metadata.gz: 56e750209fdd4c61b193bbc25ce1f7e7f3646cee22fcd7ee79af381aa1c95561
4
+ data.tar.gz: 2b741797f40b209ad5b8451aba4adae514914ee6f8dd86a55b8bbf7d5e910e98
5
5
  SHA512:
6
- metadata.gz: 162f3915a9e8e4cc51f163e053f284b45f2e228cc0cb1b3f2797a3aafdfe9ddafeb3177c842b5b2d232689d9304e3c9465a80adf93085e39776b76d1719adeed
7
- data.tar.gz: 28802dd0a439b8a1d778143a650bae055dd73597c7edbb9db5aa6085e938da529f46216aa4cabf0f583329c9fadf49e5940b6e6376b267eff2baa56c78faff13
6
+ metadata.gz: 7110c61739f8a373080d03a017ab674713831df05acbb64e6f1c8d8fa6d7ca8e365f3f9cac3a0b47f046b7bd1778e4e01488142d8c1c1de355570363ce710210
7
+ data.tar.gz: d7a02f18c7e656475bb54081885eb6d82031bd76cc3f5515a68561191ccc5051216157924cfc36555da804afca90e21634780592163bdffad299cd4bc1a5fb0f
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.4.0)
4
+ kreuzberg (4.4.1)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
224
224
  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
225
- kreuzberg (4.4.0)
225
+ kreuzberg (4.4.1)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -45,7 +45,7 @@ collapsible_if = "allow"
45
45
 
46
46
  [package]
47
47
  name = "kreuzberg-rb"
48
- version = "4.4.0"
48
+ version = "4.4.1"
49
49
  edition = "2024"
50
50
  rust-version = "1.91"
51
51
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.4.0'
4
+ VERSION = '4.4.1'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.4.0"
5
+ version = "4.4.1"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.4.0"
3
+ version = "4.4.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.4.0 Release**
20
+ > **🚀 Version 4.4.1 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -43,11 +43,11 @@ fn html_tag_regex() -> &'static Regex {
43
43
  }
44
44
 
45
45
  fn script_regex() -> &'static Regex {
46
- SCRIPT_RE.get_or_init(|| Regex::new(r"(?i)<script[^>]*>.*?</script>").unwrap())
46
+ SCRIPT_RE.get_or_init(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap())
47
47
  }
48
48
 
49
49
  fn style_regex() -> &'static Regex {
50
- STYLE_RE.get_or_init(|| Regex::new(r"(?i)<style[^>]*>.*?</style>").unwrap())
50
+ STYLE_RE.get_or_init(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap())
51
51
  }
52
52
 
53
53
  fn whitespace_regex() -> &'static Regex {
@@ -166,16 +166,22 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
166
166
  })
167
167
  .unwrap_or_else(Vec::new);
168
168
 
169
- let date = message
170
- .header("Date")
171
- .and_then(|hv| {
172
- if let mail_parser::HeaderValue::Text(s) = hv {
173
- Some(s.trim().to_string())
174
- } else {
169
+ // Extract date: prefer the raw Date header text (preserves original format),
170
+ // falling back to mail_parser's parsed DateTime → RFC 3339.
171
+ // mail_parser parses standard RFC 2822 dates into HeaderValue::DateTime,
172
+ // losing the original string. For non-standard dates (ISO 8601, invalid strings),
173
+ // it may produce garbled output. We extract the raw header from the email bytes.
174
+ let date = extract_raw_date_header(&data).or_else(|| {
175
+ message.date().and_then(|d| {
176
+ let rfc3339 = d.to_rfc3339();
177
+ // Reject obviously garbled dates (year 2000, month 0)
178
+ if rfc3339.starts_with("2000-00") || rfc3339.starts_with("0000-") {
175
179
  None
180
+ } else {
181
+ Some(rfc3339)
176
182
  }
177
183
  })
178
- .or_else(|| message.date().map(|d| d.to_rfc3339()));
184
+ });
179
185
 
180
186
  let message_id = message.message_id().map(|id| id.to_string());
181
187
 
@@ -336,25 +342,25 @@ fn extract_msg_from_cfb<F: std::io::Read + std::io::Seek>(
336
342
  Some(name) if !name.is_empty() => format!("\"{}\" <{}>", name, email),
337
343
  _ => email.clone(),
338
344
  });
339
- let display_to = read_msg_string_prop(comp, "", 0x0E04); // PR_DISPLAY_TO
340
- let display_cc = read_msg_string_prop(comp, "", 0x0E03); // PR_DISPLAY_CC
341
- let display_bcc = read_msg_string_prop(comp, "", 0x0E02); // PR_DISPLAY_BCC
342
345
  let body = read_msg_string_prop(comp, "", 0x1000); // PR_BODY
343
346
  let html_body = read_msg_string_prop(comp, "", 0x1013); // PR_BODY_HTML
344
347
  let message_id = read_msg_string_prop(comp, "", 0x1035) // PR_INTERNET_MESSAGE_ID
345
348
  .filter(|s| !s.is_empty());
346
- let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
347
349
 
348
- // Parse date from transport headers (e.g. "Date: Mon, 1 Jan 2024 …").
349
- let date = headers.as_ref().and_then(|h| {
350
- h.lines()
351
- .find(|line| line.starts_with("Date:"))
352
- .map(|line| line.trim_start_matches("Date:").trim().to_string())
353
- });
350
+ // --- date: prefer PR_CLIENT_SUBMIT_TIME, fall back to transport headers ---
351
+ let date = read_msg_filetime_prop(comp, "", 0x0039) // PR_CLIENT_SUBMIT_TIME
352
+ .or_else(|| read_msg_filetime_prop(comp, "", 0x0E06)) // PR_MESSAGE_DELIVERY_TIME
353
+ .or_else(|| {
354
+ let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
355
+ headers.as_ref().and_then(|h| {
356
+ h.lines()
357
+ .find(|line| line.starts_with("Date:"))
358
+ .map(|line| line.trim_start_matches("Date:").trim().to_string())
359
+ })
360
+ });
354
361
 
355
- let to_emails = split_display_addresses(&display_to);
356
- let cc_emails = split_display_addresses(&display_cc);
357
- let bcc_emails = split_display_addresses(&display_bcc);
362
+ // --- recipients: read from substorages for full email addresses -----------
363
+ let (to_emails, cc_emails, bcc_emails) = read_msg_recipients(comp);
358
364
 
359
365
  let plain_text = body.filter(|s| !s.is_empty());
360
366
  let html_content = html_body.filter(|s| !s.is_empty());
@@ -488,15 +494,188 @@ fn decode_utf16le_bytes(data: &[u8]) -> String {
488
494
  String::from_utf16_lossy(&u16s).trim_end_matches('\0').to_string()
489
495
  }
490
496
 
491
- /// Split semicolon/comma-separated display addresses into individual strings.
492
- fn split_display_addresses(display: &Option<String>) -> Vec<String> {
493
- display
494
- .as_deref()
495
- .unwrap_or("")
496
- .split([';', ','])
497
- .map(|s| s.trim().to_string())
498
- .filter(|s| !s.is_empty())
499
- .collect()
497
+ /// Read a PT_SYSTIME (FILETIME) property from the __properties_version1.0 stream
498
+ /// and convert it to an ISO 8601 date string.
499
+ ///
500
+ /// FILETIME is a 64-bit value representing 100-nanosecond intervals since 1601-01-01.
501
+ fn read_msg_filetime_prop<F: std::io::Read + std::io::Seek>(
502
+ comp: &mut cfb::CompoundFile<F>,
503
+ base: &str,
504
+ prop_id: u16,
505
+ ) -> Option<String> {
506
+ use std::io::Read;
507
+
508
+ let props_path = format!("{base}/__properties_version1.0");
509
+ let mut stream = comp.open_stream(&props_path).ok()?;
510
+ let mut buf = Vec::new();
511
+ stream.read_to_end(&mut buf).ok()?;
512
+
513
+ // Message-level properties have a 32-byte header; recipient/attachment have 8-byte.
514
+ let header_size: usize = if base.is_empty() { 32 } else { 8 };
515
+ let mut offset = header_size;
516
+
517
+ while offset + 16 <= buf.len() {
518
+ // MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8)
519
+ let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
520
+ let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
521
+
522
+ if pid == prop_id && ptype == 0x0040 {
523
+ // PT_SYSTIME
524
+ let filetime = u64::from_le_bytes(buf[offset + 8..offset + 16].try_into().ok()?);
525
+ return filetime_to_iso8601(filetime);
526
+ }
527
+ offset += 16;
528
+ }
529
+ None
530
+ }
531
+
532
+ /// Convert a Windows FILETIME (100-ns intervals since 1601-01-01) to ISO 8601.
533
+ fn filetime_to_iso8601(filetime: u64) -> Option<String> {
534
+ // Epoch offset: difference between 1601-01-01 and 1970-01-01 in 100-ns intervals
535
+ const EPOCH_DIFF: u64 = 116_444_736_000_000_000;
536
+ if filetime < EPOCH_DIFF {
537
+ return None;
538
+ }
539
+ let hundred_ns = filetime - EPOCH_DIFF;
540
+ let secs = (hundred_ns / 10_000_000) as i64;
541
+ let nanos = ((hundred_ns % 10_000_000) * 100) as u32;
542
+
543
+ // Format manually to avoid pulling in chrono
544
+ let days_since_epoch = secs / 86400;
545
+ let time_of_day = secs % 86400;
546
+ let (hour, min, sec) = (time_of_day / 3600, (time_of_day % 3600) / 60, time_of_day % 60);
547
+
548
+ // Civil date calculation from days since 1970-01-01 (algorithm from Howard Hinnant)
549
+ let z = days_since_epoch + 719468;
550
+ let era = (if z >= 0 { z } else { z - 146096 }) / 146097;
551
+ let doe = z - era * 146097;
552
+ let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365;
553
+ let y = yoe + era * 400;
554
+ let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
555
+ let mp = (5 * doy + 2) / 153;
556
+ let d = doy - (153 * mp + 2) / 5 + 1;
557
+ let m = if mp < 10 { mp + 3 } else { mp - 9 };
558
+ let y = if m <= 2 { y + 1 } else { y };
559
+
560
+ if nanos == 0 {
561
+ Some(format!("{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}+00:00"))
562
+ } else {
563
+ // Include sub-second precision
564
+ let frac = nanos / 1_000_000; // milliseconds
565
+ Some(format!(
566
+ "{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}.{frac:03}+00:00"
567
+ ))
568
+ }
569
+ }
570
+
571
+ /// Read recipients from MSG __recip_version1.0_#XXXXXXXX substorages.
572
+ ///
573
+ /// Returns (to, cc, bcc) vectors. Each entry is formatted as `"Name" <email>` or just `email`.
574
+ fn read_msg_recipients<F: std::io::Read + std::io::Seek>(
575
+ comp: &mut cfb::CompoundFile<F>,
576
+ ) -> (Vec<String>, Vec<String>, Vec<String>) {
577
+ // Collect recipient storage paths
578
+ let recip_paths: Vec<String> = comp
579
+ .walk()
580
+ .filter(|e| e.is_storage() && e.name().starts_with("__recip_version1.0_"))
581
+ .map(|e| e.path().to_string_lossy().into_owned())
582
+ .collect();
583
+
584
+ let mut to_emails = Vec::new();
585
+ let mut cc_emails = Vec::new();
586
+ let mut bcc_emails = Vec::new();
587
+
588
+ for path in &recip_paths {
589
+ let display_name = read_msg_string_prop(comp, path, 0x3001); // PR_DISPLAY_NAME
590
+ let email_addr = read_msg_string_prop(comp, path, 0x39FE) // PR_SMTP_ADDRESS
591
+ .or_else(|| read_msg_string_prop(comp, path, 0x3003)) // PR_EMAIL_ADDRESS
592
+ .filter(|s| !s.is_empty());
593
+
594
+ let formatted = match (&display_name, &email_addr) {
595
+ (Some(name), Some(email)) if !name.is_empty() && name != email => {
596
+ format!("\"{}\" <{}>", name, email)
597
+ }
598
+ (_, Some(email)) => email.clone(),
599
+ (Some(name), None) if !name.is_empty() => name.clone(),
600
+ _ => continue,
601
+ };
602
+
603
+ // Read PR_RECIPIENT_TYPE from properties stream
604
+ let recip_type = read_msg_recip_type(comp, path);
605
+ match recip_type {
606
+ 1 => to_emails.push(formatted), // MAPI_TO
607
+ 2 => cc_emails.push(formatted), // MAPI_CC
608
+ 3 => bcc_emails.push(formatted), // MAPI_BCC
609
+ _ => to_emails.push(formatted), // Default to To
610
+ }
611
+ }
612
+
613
+ (to_emails, cc_emails, bcc_emails)
614
+ }
615
+
616
+ /// Read PR_RECIPIENT_TYPE (0x0C15) from a recipient's __properties_version1.0 stream.
617
+ /// Returns 1 (To), 2 (CC), 3 (BCC), or 0 if not found.
618
+ fn read_msg_recip_type<F: std::io::Read + std::io::Seek>(comp: &mut cfb::CompoundFile<F>, base: &str) -> u32 {
619
+ use std::io::Read;
620
+
621
+ let props_path = format!("{base}/__properties_version1.0");
622
+ let mut stream = match comp.open_stream(&props_path) {
623
+ Ok(s) => s,
624
+ Err(_) => return 0,
625
+ };
626
+ let mut buf = Vec::new();
627
+ if stream.read_to_end(&mut buf).is_err() {
628
+ return 0;
629
+ }
630
+
631
+ // Recipient properties have 8-byte header
632
+ let mut offset = 8;
633
+ while offset + 16 <= buf.len() {
634
+ // MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8)
635
+ let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
636
+ let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
637
+
638
+ if pid == 0x0C15 && ptype == 0x0003 {
639
+ // PT_LONG
640
+ return u32::from_le_bytes([buf[offset + 8], buf[offset + 9], buf[offset + 10], buf[offset + 11]]);
641
+ }
642
+ offset += 16;
643
+ }
644
+ 0
645
+ }
646
+
647
+ /// Extract the raw Date header value from email bytes.
648
+ ///
649
+ /// Scans for `Date:` in the header section (before the blank line that separates
650
+ /// headers from body) and returns the raw value, handling continuation lines.
651
+ fn extract_raw_date_header(data: &[u8]) -> Option<String> {
652
+ let text = std::str::from_utf8(data).ok()?;
653
+
654
+ // Find the end of headers (blank line)
655
+ let header_end = text
656
+ .find("\r\n\r\n")
657
+ .or_else(|| text.find("\n\n"))
658
+ .unwrap_or(text.len().min(8192)); // Cap scan to 8KB
659
+
660
+ let headers = &text[..header_end];
661
+
662
+ // Find Date: header (case-insensitive start, then exact field name)
663
+ let mut date_value = None;
664
+ for line in headers.lines() {
665
+ if let Some(val) = line.strip_prefix("Date:").or_else(|| line.strip_prefix("date:")) {
666
+ date_value = Some(val.trim().to_string());
667
+ } else if date_value.is_some() && (line.starts_with(' ') || line.starts_with('\t')) {
668
+ // Continuation line (folded header)
669
+ if let Some(ref mut dv) = date_value {
670
+ dv.push(' ');
671
+ dv.push_str(line.trim());
672
+ }
673
+ } else if date_value.is_some() {
674
+ break; // Next header field
675
+ }
676
+ }
677
+
678
+ date_value.filter(|s| !s.is_empty())
500
679
  }
501
680
 
502
681
  /// Extract email content from either .eml or .msg format
@@ -545,17 +724,8 @@ pub fn build_email_text_output(result: &EmailExtractionResult) -> String {
545
724
 
546
725
  text_parts.push(result.cleaned_text.clone());
547
726
 
548
- if !result.attachments.is_empty() {
549
- let mut attachment_names = Vec::with_capacity(result.attachments.len().min(20));
550
- for att in &result.attachments {
551
- if let Some(name) = att.name.as_ref().or(att.filename.as_ref()) {
552
- attachment_names.push(name.clone());
553
- }
554
- }
555
- if !attachment_names.is_empty() {
556
- text_parts.push(format!("Attachments: {}", attachment_names.join(", ")));
557
- }
558
- }
727
+ // Attachment names are stored in metadata but not included in the text output.
728
+ // This keeps the text output focused on message content.
559
729
 
560
730
  text_parts.join("\n")
561
731
  }
@@ -790,7 +960,9 @@ mod tests {
790
960
  };
791
961
 
792
962
  let output = build_email_text_output(&result);
793
- assert!(output.contains("Attachments: file.txt"));
963
+ // Attachment names are stored in metadata, not in text output
964
+ assert!(!output.contains("Attachments:"));
965
+ assert!(output.contains("Hello World"));
794
966
  }
795
967
 
796
968
  #[test]
@@ -1,15 +1,47 @@
1
1
  //! Text formatting utilities for RTF content.
2
2
 
3
- /// Normalize whitespace in a string using a single-pass algorithm.
3
+ /// Normalize whitespace in a string.
4
4
  ///
5
- /// Collapses multiple consecutive whitespace characters into single spaces
6
- /// and trims leading/trailing whitespace.
5
+ /// - Collapses multiple consecutive spaces/tabs into a single space
6
+ /// - Preserves single newlines (paragraph breaks from \par)
7
+ /// - Collapses multiple consecutive newlines into a double newline
8
+ /// - Trims leading/trailing whitespace from each line
9
+ /// - Trims leading/trailing blank lines
7
10
  pub fn normalize_whitespace(s: &str) -> String {
8
- let mut result = String::with_capacity(s.len());
9
- let mut last_was_space = false;
11
+ // Split into lines, trim each, collapse blank runs
12
+ let mut lines: Vec<&str> = Vec::new();
13
+ let mut last_blank = false;
14
+
15
+ for line in s.split('\n') {
16
+ // Collapse internal whitespace on each line
17
+ let trimmed = line.trim();
18
+ if trimmed.is_empty() {
19
+ if !last_blank && !lines.is_empty() {
20
+ lines.push("");
21
+ last_blank = true;
22
+ }
23
+ } else {
24
+ last_blank = false;
25
+ lines.push(trimmed);
26
+ }
27
+ }
10
28
 
11
- for ch in s.chars() {
12
- if ch.is_whitespace() {
29
+ // Trim trailing blank lines
30
+ while lines.last() == Some(&"") {
31
+ lines.pop();
32
+ }
33
+
34
+ // Join and collapse internal multi-spaces within each line
35
+ let joined = lines.join("\n");
36
+
37
+ // Collapse runs of spaces within lines
38
+ let mut result = String::with_capacity(joined.len());
39
+ let mut last_was_space = false;
40
+ for ch in joined.chars() {
41
+ if ch == '\n' {
42
+ result.push('\n');
43
+ last_was_space = false;
44
+ } else if ch == ' ' || ch == '\t' {
13
45
  if !last_was_space {
14
46
  result.push(' ');
15
47
  last_was_space = true;