kreuzberg 4.4.0 → 4.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/extraction/email.rs +215 -43
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +39 -7
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +148 -37
- data/vendor/kreuzberg/src/ocr/cache.rs +1 -0
- data/vendor/kreuzberg/src/ocr/conversion.rs +1 -1
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +210 -2
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +44 -10
- data/vendor/kreuzberg/src/types/formats.rs +17 -0
- data/vendor/kreuzberg/tests/ocr_table_inline.rs +277 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 56e750209fdd4c61b193bbc25ce1f7e7f3646cee22fcd7ee79af381aa1c95561
|
|
4
|
+
data.tar.gz: 2b741797f40b209ad5b8451aba4adae514914ee6f8dd86a55b8bbf7d5e910e98
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7110c61739f8a373080d03a017ab674713831df05acbb64e6f1c8d8fa6d7ca8e365f3f9cac3a0b47f046b7bd1778e4e01488142d8c1c1de355570363ce710210
|
|
7
|
+
data.tar.gz: d7a02f18c7e656475bb54081885eb6d82031bd76cc3f5515a68561191ccc5051216157924cfc36555da804afca90e21634780592163bdffad299cd4bc1a5fb0f
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.0)
|
|
4
|
+
kreuzberg (4.4.1)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
224
224
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
225
|
-
kreuzberg (4.4.0)
|
|
225
|
+
kreuzberg (4.4.1)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.4.0"
|
|
5
|
+
version = "4.4.1"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.4.0 Release**
|
|
20
|
+
> **🚀 Version 4.4.1 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -43,11 +43,11 @@ fn html_tag_regex() -> &'static Regex {
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
fn script_regex() -> &'static Regex {
|
|
46
|
-
SCRIPT_RE.get_or_init(|| Regex::new(r"(?
|
|
46
|
+
SCRIPT_RE.get_or_init(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap())
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
fn style_regex() -> &'static Regex {
|
|
50
|
-
STYLE_RE.get_or_init(|| Regex::new(r"(?
|
|
50
|
+
STYLE_RE.get_or_init(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap())
|
|
51
51
|
}
|
|
52
52
|
|
|
53
53
|
fn whitespace_regex() -> &'static Regex {
|
|
@@ -166,16 +166,22 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
166
166
|
})
|
|
167
167
|
.unwrap_or_else(Vec::new);
|
|
168
168
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
169
|
+
// Extract date: prefer the raw Date header text (preserves original format),
|
|
170
|
+
// falling back to mail_parser's parsed DateTime → RFC 3339.
|
|
171
|
+
// mail_parser parses standard RFC 2822 dates into HeaderValue::DateTime,
|
|
172
|
+
// losing the original string. For non-standard dates (ISO 8601, invalid strings),
|
|
173
|
+
// it may produce garbled output. We extract the raw header from the email bytes.
|
|
174
|
+
let date = extract_raw_date_header(&data).or_else(|| {
|
|
175
|
+
message.date().and_then(|d| {
|
|
176
|
+
let rfc3339 = d.to_rfc3339();
|
|
177
|
+
// Reject obviously garbled dates (year 2000, month 0)
|
|
178
|
+
if rfc3339.starts_with("2000-00") || rfc3339.starts_with("0000-") {
|
|
175
179
|
None
|
|
180
|
+
} else {
|
|
181
|
+
Some(rfc3339)
|
|
176
182
|
}
|
|
177
183
|
})
|
|
178
|
-
|
|
184
|
+
});
|
|
179
185
|
|
|
180
186
|
let message_id = message.message_id().map(|id| id.to_string());
|
|
181
187
|
|
|
@@ -336,25 +342,25 @@ fn extract_msg_from_cfb<F: std::io::Read + std::io::Seek>(
|
|
|
336
342
|
Some(name) if !name.is_empty() => format!("\"{}\" <{}>", name, email),
|
|
337
343
|
_ => email.clone(),
|
|
338
344
|
});
|
|
339
|
-
let display_to = read_msg_string_prop(comp, "", 0x0E04); // PR_DISPLAY_TO
|
|
340
|
-
let display_cc = read_msg_string_prop(comp, "", 0x0E03); // PR_DISPLAY_CC
|
|
341
|
-
let display_bcc = read_msg_string_prop(comp, "", 0x0E02); // PR_DISPLAY_BCC
|
|
342
345
|
let body = read_msg_string_prop(comp, "", 0x1000); // PR_BODY
|
|
343
346
|
let html_body = read_msg_string_prop(comp, "", 0x1013); // PR_BODY_HTML
|
|
344
347
|
let message_id = read_msg_string_prop(comp, "", 0x1035) // PR_INTERNET_MESSAGE_ID
|
|
345
348
|
.filter(|s| !s.is_empty());
|
|
346
|
-
let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
|
|
347
349
|
|
|
348
|
-
//
|
|
349
|
-
let date =
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
350
|
+
// --- date: prefer PR_CLIENT_SUBMIT_TIME, fall back to transport headers ---
|
|
351
|
+
let date = read_msg_filetime_prop(comp, "", 0x0039) // PR_CLIENT_SUBMIT_TIME
|
|
352
|
+
.or_else(|| read_msg_filetime_prop(comp, "", 0x0E06)) // PR_MESSAGE_DELIVERY_TIME
|
|
353
|
+
.or_else(|| {
|
|
354
|
+
let headers = read_msg_string_prop(comp, "", 0x007D); // PR_TRANSPORT_MESSAGE_HEADERS
|
|
355
|
+
headers.as_ref().and_then(|h| {
|
|
356
|
+
h.lines()
|
|
357
|
+
.find(|line| line.starts_with("Date:"))
|
|
358
|
+
.map(|line| line.trim_start_matches("Date:").trim().to_string())
|
|
359
|
+
})
|
|
360
|
+
});
|
|
354
361
|
|
|
355
|
-
|
|
356
|
-
let cc_emails =
|
|
357
|
-
let bcc_emails = split_display_addresses(&display_bcc);
|
|
362
|
+
// --- recipients: read from substorages for full email addresses -----------
|
|
363
|
+
let (to_emails, cc_emails, bcc_emails) = read_msg_recipients(comp);
|
|
358
364
|
|
|
359
365
|
let plain_text = body.filter(|s| !s.is_empty());
|
|
360
366
|
let html_content = html_body.filter(|s| !s.is_empty());
|
|
@@ -488,15 +494,188 @@ fn decode_utf16le_bytes(data: &[u8]) -> String {
|
|
|
488
494
|
String::from_utf16_lossy(&u16s).trim_end_matches('\0').to_string()
|
|
489
495
|
}
|
|
490
496
|
|
|
491
|
-
///
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
497
|
+
/// Read a PT_SYSTIME (FILETIME) property from the __properties_version1.0 stream
|
|
498
|
+
/// and convert it to an ISO 8601 date string.
|
|
499
|
+
///
|
|
500
|
+
/// FILETIME is a 64-bit value representing 100-nanosecond intervals since 1601-01-01.
|
|
501
|
+
fn read_msg_filetime_prop<F: std::io::Read + std::io::Seek>(
|
|
502
|
+
comp: &mut cfb::CompoundFile<F>,
|
|
503
|
+
base: &str,
|
|
504
|
+
prop_id: u16,
|
|
505
|
+
) -> Option<String> {
|
|
506
|
+
use std::io::Read;
|
|
507
|
+
|
|
508
|
+
let props_path = format!("{base}/__properties_version1.0");
|
|
509
|
+
let mut stream = comp.open_stream(&props_path).ok()?;
|
|
510
|
+
let mut buf = Vec::new();
|
|
511
|
+
stream.read_to_end(&mut buf).ok()?;
|
|
512
|
+
|
|
513
|
+
// Message-level properties have a 32-byte header; recipient/attachment have 8-byte.
|
|
514
|
+
let header_size: usize = if base.is_empty() { 32 } else { 8 };
|
|
515
|
+
let mut offset = header_size;
|
|
516
|
+
|
|
517
|
+
while offset + 16 <= buf.len() {
|
|
518
|
+
// MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8)
|
|
519
|
+
let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
|
|
520
|
+
let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
|
|
521
|
+
|
|
522
|
+
if pid == prop_id && ptype == 0x0040 {
|
|
523
|
+
// PT_SYSTIME
|
|
524
|
+
let filetime = u64::from_le_bytes(buf[offset + 8..offset + 16].try_into().ok()?);
|
|
525
|
+
return filetime_to_iso8601(filetime);
|
|
526
|
+
}
|
|
527
|
+
offset += 16;
|
|
528
|
+
}
|
|
529
|
+
None
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
/// Convert a Windows FILETIME (100-ns intervals since 1601-01-01) to ISO 8601.
|
|
533
|
+
fn filetime_to_iso8601(filetime: u64) -> Option<String> {
|
|
534
|
+
// Epoch offset: difference between 1601-01-01 and 1970-01-01 in 100-ns intervals
|
|
535
|
+
const EPOCH_DIFF: u64 = 116_444_736_000_000_000;
|
|
536
|
+
if filetime < EPOCH_DIFF {
|
|
537
|
+
return None;
|
|
538
|
+
}
|
|
539
|
+
let hundred_ns = filetime - EPOCH_DIFF;
|
|
540
|
+
let secs = (hundred_ns / 10_000_000) as i64;
|
|
541
|
+
let nanos = ((hundred_ns % 10_000_000) * 100) as u32;
|
|
542
|
+
|
|
543
|
+
// Format manually to avoid pulling in chrono
|
|
544
|
+
let days_since_epoch = secs / 86400;
|
|
545
|
+
let time_of_day = secs % 86400;
|
|
546
|
+
let (hour, min, sec) = (time_of_day / 3600, (time_of_day % 3600) / 60, time_of_day % 60);
|
|
547
|
+
|
|
548
|
+
// Civil date calculation from days since 1970-01-01 (algorithm from Howard Hinnant)
|
|
549
|
+
let z = days_since_epoch + 719468;
|
|
550
|
+
let era = (if z >= 0 { z } else { z - 146096 }) / 146097;
|
|
551
|
+
let doe = z - era * 146097;
|
|
552
|
+
let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365;
|
|
553
|
+
let y = yoe + era * 400;
|
|
554
|
+
let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
|
|
555
|
+
let mp = (5 * doy + 2) / 153;
|
|
556
|
+
let d = doy - (153 * mp + 2) / 5 + 1;
|
|
557
|
+
let m = if mp < 10 { mp + 3 } else { mp - 9 };
|
|
558
|
+
let y = if m <= 2 { y + 1 } else { y };
|
|
559
|
+
|
|
560
|
+
if nanos == 0 {
|
|
561
|
+
Some(format!("{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}+00:00"))
|
|
562
|
+
} else {
|
|
563
|
+
// Include sub-second precision
|
|
564
|
+
let frac = nanos / 1_000_000; // milliseconds
|
|
565
|
+
Some(format!(
|
|
566
|
+
"{y:04}-{m:02}-{d:02}T{hour:02}:{min:02}:{sec:02}.{frac:03}+00:00"
|
|
567
|
+
))
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
/// Read recipients from MSG __recip_version1.0_#XXXXXXXX substorages.
|
|
572
|
+
///
|
|
573
|
+
/// Returns (to, cc, bcc) vectors. Each entry is formatted as `"Name" <email>` or just `email`.
|
|
574
|
+
fn read_msg_recipients<F: std::io::Read + std::io::Seek>(
|
|
575
|
+
comp: &mut cfb::CompoundFile<F>,
|
|
576
|
+
) -> (Vec<String>, Vec<String>, Vec<String>) {
|
|
577
|
+
// Collect recipient storage paths
|
|
578
|
+
let recip_paths: Vec<String> = comp
|
|
579
|
+
.walk()
|
|
580
|
+
.filter(|e| e.is_storage() && e.name().starts_with("__recip_version1.0_"))
|
|
581
|
+
.map(|e| e.path().to_string_lossy().into_owned())
|
|
582
|
+
.collect();
|
|
583
|
+
|
|
584
|
+
let mut to_emails = Vec::new();
|
|
585
|
+
let mut cc_emails = Vec::new();
|
|
586
|
+
let mut bcc_emails = Vec::new();
|
|
587
|
+
|
|
588
|
+
for path in &recip_paths {
|
|
589
|
+
let display_name = read_msg_string_prop(comp, path, 0x3001); // PR_DISPLAY_NAME
|
|
590
|
+
let email_addr = read_msg_string_prop(comp, path, 0x39FE) // PR_SMTP_ADDRESS
|
|
591
|
+
.or_else(|| read_msg_string_prop(comp, path, 0x3003)) // PR_EMAIL_ADDRESS
|
|
592
|
+
.filter(|s| !s.is_empty());
|
|
593
|
+
|
|
594
|
+
let formatted = match (&display_name, &email_addr) {
|
|
595
|
+
(Some(name), Some(email)) if !name.is_empty() && name != email => {
|
|
596
|
+
format!("\"{}\" <{}>", name, email)
|
|
597
|
+
}
|
|
598
|
+
(_, Some(email)) => email.clone(),
|
|
599
|
+
(Some(name), None) if !name.is_empty() => name.clone(),
|
|
600
|
+
_ => continue,
|
|
601
|
+
};
|
|
602
|
+
|
|
603
|
+
// Read PR_RECIPIENT_TYPE from properties stream
|
|
604
|
+
let recip_type = read_msg_recip_type(comp, path);
|
|
605
|
+
match recip_type {
|
|
606
|
+
1 => to_emails.push(formatted), // MAPI_TO
|
|
607
|
+
2 => cc_emails.push(formatted), // MAPI_CC
|
|
608
|
+
3 => bcc_emails.push(formatted), // MAPI_BCC
|
|
609
|
+
_ => to_emails.push(formatted), // Default to To
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
(to_emails, cc_emails, bcc_emails)
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
/// Read PR_RECIPIENT_TYPE (0x0C15) from a recipient's __properties_version1.0 stream.
|
|
617
|
+
/// Returns 1 (To), 2 (CC), 3 (BCC), or 0 if not found.
|
|
618
|
+
fn read_msg_recip_type<F: std::io::Read + std::io::Seek>(comp: &mut cfb::CompoundFile<F>, base: &str) -> u32 {
|
|
619
|
+
use std::io::Read;
|
|
620
|
+
|
|
621
|
+
let props_path = format!("{base}/__properties_version1.0");
|
|
622
|
+
let mut stream = match comp.open_stream(&props_path) {
|
|
623
|
+
Ok(s) => s,
|
|
624
|
+
Err(_) => return 0,
|
|
625
|
+
};
|
|
626
|
+
let mut buf = Vec::new();
|
|
627
|
+
if stream.read_to_end(&mut buf).is_err() {
|
|
628
|
+
return 0;
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
// Recipient properties have 8-byte header
|
|
632
|
+
let mut offset = 8;
|
|
633
|
+
while offset + 16 <= buf.len() {
|
|
634
|
+
// MAPI property entry: prop_type (2) + prop_id (2) + flags (4) + value (8)
|
|
635
|
+
let ptype = u16::from_le_bytes([buf[offset], buf[offset + 1]]);
|
|
636
|
+
let pid = u16::from_le_bytes([buf[offset + 2], buf[offset + 3]]);
|
|
637
|
+
|
|
638
|
+
if pid == 0x0C15 && ptype == 0x0003 {
|
|
639
|
+
// PT_LONG
|
|
640
|
+
return u32::from_le_bytes([buf[offset + 8], buf[offset + 9], buf[offset + 10], buf[offset + 11]]);
|
|
641
|
+
}
|
|
642
|
+
offset += 16;
|
|
643
|
+
}
|
|
644
|
+
0
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
/// Extract the raw Date header value from email bytes.
|
|
648
|
+
///
|
|
649
|
+
/// Scans for `Date:` in the header section (before the blank line that separates
|
|
650
|
+
/// headers from body) and returns the raw value, handling continuation lines.
|
|
651
|
+
fn extract_raw_date_header(data: &[u8]) -> Option<String> {
|
|
652
|
+
let text = std::str::from_utf8(data).ok()?;
|
|
653
|
+
|
|
654
|
+
// Find the end of headers (blank line)
|
|
655
|
+
let header_end = text
|
|
656
|
+
.find("\r\n\r\n")
|
|
657
|
+
.or_else(|| text.find("\n\n"))
|
|
658
|
+
.unwrap_or(text.len().min(8192)); // Cap scan to 8KB
|
|
659
|
+
|
|
660
|
+
let headers = &text[..header_end];
|
|
661
|
+
|
|
662
|
+
// Find Date: header (case-insensitive start, then exact field name)
|
|
663
|
+
let mut date_value = None;
|
|
664
|
+
for line in headers.lines() {
|
|
665
|
+
if let Some(val) = line.strip_prefix("Date:").or_else(|| line.strip_prefix("date:")) {
|
|
666
|
+
date_value = Some(val.trim().to_string());
|
|
667
|
+
} else if date_value.is_some() && (line.starts_with(' ') || line.starts_with('\t')) {
|
|
668
|
+
// Continuation line (folded header)
|
|
669
|
+
if let Some(ref mut dv) = date_value {
|
|
670
|
+
dv.push(' ');
|
|
671
|
+
dv.push_str(line.trim());
|
|
672
|
+
}
|
|
673
|
+
} else if date_value.is_some() {
|
|
674
|
+
break; // Next header field
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
date_value.filter(|s| !s.is_empty())
|
|
500
679
|
}
|
|
501
680
|
|
|
502
681
|
/// Extract email content from either .eml or .msg format
|
|
@@ -545,17 +724,8 @@ pub fn build_email_text_output(result: &EmailExtractionResult) -> String {
|
|
|
545
724
|
|
|
546
725
|
text_parts.push(result.cleaned_text.clone());
|
|
547
726
|
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
for att in &result.attachments {
|
|
551
|
-
if let Some(name) = att.name.as_ref().or(att.filename.as_ref()) {
|
|
552
|
-
attachment_names.push(name.clone());
|
|
553
|
-
}
|
|
554
|
-
}
|
|
555
|
-
if !attachment_names.is_empty() {
|
|
556
|
-
text_parts.push(format!("Attachments: {}", attachment_names.join(", ")));
|
|
557
|
-
}
|
|
558
|
-
}
|
|
727
|
+
// Attachment names are stored in metadata but not included in the text output.
|
|
728
|
+
// This keeps the text output focused on message content.
|
|
559
729
|
|
|
560
730
|
text_parts.join("\n")
|
|
561
731
|
}
|
|
@@ -790,7 +960,9 @@ mod tests {
|
|
|
790
960
|
};
|
|
791
961
|
|
|
792
962
|
let output = build_email_text_output(&result);
|
|
793
|
-
|
|
963
|
+
// Attachment names are stored in metadata, not in text output
|
|
964
|
+
assert!(!output.contains("Attachments:"));
|
|
965
|
+
assert!(output.contains("Hello World"));
|
|
794
966
|
}
|
|
795
967
|
|
|
796
968
|
#[test]
|
|
@@ -1,15 +1,47 @@
|
|
|
1
1
|
//! Text formatting utilities for RTF content.
|
|
2
2
|
|
|
3
|
-
/// Normalize whitespace in a string
|
|
3
|
+
/// Normalize whitespace in a string.
|
|
4
4
|
///
|
|
5
|
-
/// Collapses multiple consecutive
|
|
6
|
-
///
|
|
5
|
+
/// - Collapses multiple consecutive spaces/tabs into a single space
|
|
6
|
+
/// - Preserves single newlines (paragraph breaks from \par)
|
|
7
|
+
/// - Collapses multiple consecutive newlines into a double newline
|
|
8
|
+
/// - Trims leading/trailing whitespace from each line
|
|
9
|
+
/// - Trims leading/trailing blank lines
|
|
7
10
|
pub fn normalize_whitespace(s: &str) -> String {
|
|
8
|
-
|
|
9
|
-
let mut
|
|
11
|
+
// Split into lines, trim each, collapse blank runs
|
|
12
|
+
let mut lines: Vec<&str> = Vec::new();
|
|
13
|
+
let mut last_blank = false;
|
|
14
|
+
|
|
15
|
+
for line in s.split('\n') {
|
|
16
|
+
// Collapse internal whitespace on each line
|
|
17
|
+
let trimmed = line.trim();
|
|
18
|
+
if trimmed.is_empty() {
|
|
19
|
+
if !last_blank && !lines.is_empty() {
|
|
20
|
+
lines.push("");
|
|
21
|
+
last_blank = true;
|
|
22
|
+
}
|
|
23
|
+
} else {
|
|
24
|
+
last_blank = false;
|
|
25
|
+
lines.push(trimmed);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
10
28
|
|
|
11
|
-
|
|
12
|
-
|
|
29
|
+
// Trim trailing blank lines
|
|
30
|
+
while lines.last() == Some(&"") {
|
|
31
|
+
lines.pop();
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Join and collapse internal multi-spaces within each line
|
|
35
|
+
let joined = lines.join("\n");
|
|
36
|
+
|
|
37
|
+
// Collapse runs of spaces within lines
|
|
38
|
+
let mut result = String::with_capacity(joined.len());
|
|
39
|
+
let mut last_was_space = false;
|
|
40
|
+
for ch in joined.chars() {
|
|
41
|
+
if ch == '\n' {
|
|
42
|
+
result.push('\n');
|
|
43
|
+
last_was_space = false;
|
|
44
|
+
} else if ch == ' ' || ch == '\t' {
|
|
13
45
|
if !last_was_space {
|
|
14
46
|
result.push(' ');
|
|
15
47
|
last_was_space = true;
|