kreuzberg 4.2.13 → 4.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +2 -2
- data/vendor/kreuzberg/src/core/mime.rs +1 -1
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +38 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +122 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +99 -13
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +167 -34
- data/vendor/kreuzberg/src/pdf/bindings.rs +6 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +9 -0
- data/vendor/kreuzberg/src/pdf/text.rs +8 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: da61e06dfa4643e485c13636998888f03699816b7462087c9df6c9639d53fc45
|
|
4
|
+
data.tar.gz: 20a9c88f3eac809d2d158e15ea3747c425d47b3af0e2bf93825c831c9aa11aa9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7be55db6494d45de03b3fee1271e1bc151193709098bdfe94fb7a5fb33159dd9a0b8b08fffd5ed2d3b24f3f3766c0bb1e81b42319d25b529088afe7e6a4c52d6
|
|
7
|
+
data.tar.gz: bd94796f90094ca64775c0ded247bc216fdfe6ee50d4c6258685ccc5b5b33e1a69546cd892ac7dce77eee72123cbaf6ce239ee932a117c8ad5dfe801a9a548bf
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.13)
|
|
4
|
+
kreuzberg (4.2.14)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.13)
|
|
212
|
+
kreuzberg (4.2.14)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.14" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.13 Release**
|
|
20
|
+
> **🚀 Version 4.2.14 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
//! This module provides shared utilities used across extraction modules.
|
|
4
4
|
|
|
5
5
|
use crate::plugins::DocumentExtractor;
|
|
6
|
-
#[cfg(feature = "office")]
|
|
6
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
7
7
|
use crate::utils::intern_mime_type;
|
|
8
8
|
use crate::utils::{PoolSizeHint, estimate_pool_size};
|
|
9
9
|
use crate::{KreuzbergError, Result};
|
|
@@ -65,7 +65,7 @@ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
|
|
|
65
65
|
///
|
|
66
66
|
/// For pre-interned MIME types (all common types), this is O(1) pointer dereference.
|
|
67
67
|
/// For unknown MIME types, this allocates once per unique type and caches the result.
|
|
68
|
-
#[cfg(feature = "office")]
|
|
68
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
69
69
|
pub(in crate::core::extractor) fn pool_mime_type(mime_type: &str) -> String {
|
|
70
70
|
intern_mime_type(mime_type).to_string()
|
|
71
71
|
}
|
|
@@ -104,7 +104,7 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
104
104
|
m.insert("zip", "application/zip");
|
|
105
105
|
m.insert("tar", "application/x-tar");
|
|
106
106
|
m.insert("gz", "application/gzip");
|
|
107
|
-
m.insert("tgz", "application/
|
|
107
|
+
m.insert("tgz", "application/gzip");
|
|
108
108
|
m.insert("7z", "application/x-7z-compressed");
|
|
109
109
|
|
|
110
110
|
m.insert("rst", "text/x-rst");
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
//!
|
|
3
3
|
//! Provides functions for decompressing gzip files and extracting
|
|
4
4
|
//! metadata and text content from the compressed data.
|
|
5
|
+
//!
|
|
6
|
+
//! When a gzip file contains a TAR archive (e.g., .tar.gz files),
|
|
7
|
+
//! this module automatically detects the TAR format and delegates
|
|
8
|
+
//! to the TAR extraction functions.
|
|
5
9
|
|
|
6
10
|
use super::{ArchiveEntry, ArchiveMetadata};
|
|
7
11
|
use crate::error::{KreuzbergError, Result};
|
|
@@ -10,6 +14,14 @@ use flate2::read::GzDecoder;
|
|
|
10
14
|
use std::collections::HashMap;
|
|
11
15
|
use std::io::Read;
|
|
12
16
|
|
|
17
|
+
/// Check if data looks like a TAR archive (has "ustar" magic at offset 257).
|
|
18
|
+
///
|
|
19
|
+
/// The TAR format has a standard USTAR header starting at offset 257,
|
|
20
|
+
/// which helps identify TAR archives that have been gzip-compressed.
|
|
21
|
+
fn is_tar_archive(data: &[u8]) -> bool {
|
|
22
|
+
data.len() > 262 && &data[257..262] == b"ustar"
|
|
23
|
+
}
|
|
24
|
+
|
|
13
25
|
/// Decompress gzip bytes with a size limit to prevent decompression bombs.
|
|
14
26
|
fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
|
|
15
27
|
let decoder = GzDecoder::new(bytes);
|
|
@@ -38,9 +50,19 @@ pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>>
|
|
|
38
50
|
///
|
|
39
51
|
/// This avoids the overhead of decompressing the data multiple times when both
|
|
40
52
|
/// metadata and text content are needed.
|
|
53
|
+
///
|
|
54
|
+
/// If the decompressed data is a TAR archive, delegates to TAR extraction functions.
|
|
41
55
|
pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
|
|
42
56
|
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
43
57
|
|
|
58
|
+
// Check if the decompressed data is a TAR archive
|
|
59
|
+
if is_tar_archive(&decompressed) {
|
|
60
|
+
let mut metadata = super::tar::extract_tar_metadata(&decompressed, limits)?;
|
|
61
|
+
metadata.format = "GZIP+TAR".to_string();
|
|
62
|
+
let contents = super::tar::extract_tar_text_content(&decompressed, limits)?;
|
|
63
|
+
return Ok((metadata, contents));
|
|
64
|
+
}
|
|
65
|
+
|
|
44
66
|
// Re-read header for filename (lightweight - no decompression)
|
|
45
67
|
let mut decoder = GzDecoder::new(bytes);
|
|
46
68
|
let mut _discard = [0u8; 1];
|
|
@@ -77,9 +99,18 @@ pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMet
|
|
|
77
99
|
///
|
|
78
100
|
/// Gzip wraps a single stream, so the metadata contains one entry
|
|
79
101
|
/// with the original filename (from gzip header) and decompressed size.
|
|
102
|
+
///
|
|
103
|
+
/// If the decompressed data is a TAR archive, delegates to TAR extraction.
|
|
80
104
|
pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
81
105
|
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
82
106
|
|
|
107
|
+
// Check if the decompressed data is a TAR archive
|
|
108
|
+
if is_tar_archive(&decompressed) {
|
|
109
|
+
let mut metadata = super::tar::extract_tar_metadata(&decompressed, limits)?;
|
|
110
|
+
metadata.format = "GZIP+TAR".to_string();
|
|
111
|
+
return Ok(metadata);
|
|
112
|
+
}
|
|
113
|
+
|
|
83
114
|
let mut decoder = GzDecoder::new(bytes);
|
|
84
115
|
let mut _discard = [0u8; 1];
|
|
85
116
|
let _ = decoder.read(&mut _discard);
|
|
@@ -107,9 +138,16 @@ pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<Ar
|
|
|
107
138
|
/// Extract text content from a gzip-compressed file.
|
|
108
139
|
///
|
|
109
140
|
/// Decompresses and attempts to read the result as UTF-8 text.
|
|
141
|
+
///
|
|
142
|
+
/// If the decompressed data is a TAR archive, delegates to TAR extraction.
|
|
110
143
|
pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
111
144
|
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
112
145
|
|
|
146
|
+
// Check if the decompressed data is a TAR archive
|
|
147
|
+
if is_tar_archive(&decompressed) {
|
|
148
|
+
return super::tar::extract_tar_text_content(&decompressed, limits);
|
|
149
|
+
}
|
|
150
|
+
|
|
113
151
|
let mut decoder = GzDecoder::new(bytes);
|
|
114
152
|
let mut _discard = [0u8; 1];
|
|
115
153
|
let _ = decoder.read(&mut _discard);
|
|
@@ -878,4 +878,126 @@ mod tests {
|
|
|
878
878
|
let result = extract_gzip_metadata(&compressed, &limits);
|
|
879
879
|
assert!(result.is_err());
|
|
880
880
|
}
|
|
881
|
+
|
|
882
|
+
#[test]
|
|
883
|
+
fn test_extract_gzip_compressed_tar_metadata() {
|
|
884
|
+
use flate2::Compression;
|
|
885
|
+
use flate2::write::GzEncoder;
|
|
886
|
+
use std::io::Write;
|
|
887
|
+
|
|
888
|
+
// Create a tar archive
|
|
889
|
+
let mut tar_data = Vec::new();
|
|
890
|
+
{
|
|
891
|
+
let mut tar = TarBuilder::new(&mut tar_data);
|
|
892
|
+
|
|
893
|
+
let data1 = b"Hello from tar.gz!";
|
|
894
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
895
|
+
header1.set_path("test.txt").unwrap();
|
|
896
|
+
header1.set_size(data1.len() as u64);
|
|
897
|
+
header1.set_cksum();
|
|
898
|
+
tar.append(&header1, &data1[..]).unwrap();
|
|
899
|
+
|
|
900
|
+
let data2 = b"# Markdown file";
|
|
901
|
+
let mut header2 = ::tar::Header::new_gnu();
|
|
902
|
+
header2.set_path("readme.md").unwrap();
|
|
903
|
+
header2.set_size(data2.len() as u64);
|
|
904
|
+
header2.set_cksum();
|
|
905
|
+
tar.append(&header2, &data2[..]).unwrap();
|
|
906
|
+
|
|
907
|
+
tar.finish().unwrap();
|
|
908
|
+
}
|
|
909
|
+
|
|
910
|
+
// Gzip compress the tar data
|
|
911
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
912
|
+
encoder.write_all(&tar_data).unwrap();
|
|
913
|
+
let gzip_compressed = encoder.finish().unwrap();
|
|
914
|
+
|
|
915
|
+
// Extract metadata from the gzip-compressed tar
|
|
916
|
+
let metadata = extract_gzip_metadata(&gzip_compressed, &default_limits()).unwrap();
|
|
917
|
+
|
|
918
|
+
assert_eq!(metadata.format, "GZIP+TAR");
|
|
919
|
+
assert_eq!(metadata.file_count, 2);
|
|
920
|
+
assert_eq!(metadata.file_list.len(), 2);
|
|
921
|
+
assert!(metadata.total_size > 0);
|
|
922
|
+
|
|
923
|
+
// Verify file paths are preserved
|
|
924
|
+
let paths: Vec<&str> = metadata.file_list.iter().map(|e| e.path.as_str()).collect();
|
|
925
|
+
assert!(paths.contains(&"test.txt"));
|
|
926
|
+
assert!(paths.contains(&"readme.md"));
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
#[test]
|
|
930
|
+
fn test_extract_gzip_compressed_tar_text_content() {
|
|
931
|
+
use flate2::Compression;
|
|
932
|
+
use flate2::write::GzEncoder;
|
|
933
|
+
use std::io::Write;
|
|
934
|
+
|
|
935
|
+
// Create a tar archive
|
|
936
|
+
let mut tar_data = Vec::new();
|
|
937
|
+
{
|
|
938
|
+
let mut tar = TarBuilder::new(&mut tar_data);
|
|
939
|
+
|
|
940
|
+
let data1 = b"Hello from tar.gz!";
|
|
941
|
+
let mut header1 = ::tar::Header::new_gnu();
|
|
942
|
+
header1.set_path("test.txt").unwrap();
|
|
943
|
+
header1.set_size(data1.len() as u64);
|
|
944
|
+
header1.set_cksum();
|
|
945
|
+
tar.append(&header1, &data1[..]).unwrap();
|
|
946
|
+
|
|
947
|
+
let data2 = b"# Markdown content";
|
|
948
|
+
let mut header2 = ::tar::Header::new_gnu();
|
|
949
|
+
header2.set_path("readme.md").unwrap();
|
|
950
|
+
header2.set_size(data2.len() as u64);
|
|
951
|
+
header2.set_cksum();
|
|
952
|
+
tar.append(&header2, &data2[..]).unwrap();
|
|
953
|
+
|
|
954
|
+
tar.finish().unwrap();
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
// Gzip compress the tar data
|
|
958
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
959
|
+
encoder.write_all(&tar_data).unwrap();
|
|
960
|
+
let gzip_compressed = encoder.finish().unwrap();
|
|
961
|
+
|
|
962
|
+
// Extract text content from the gzip-compressed tar
|
|
963
|
+
let contents = extract_gzip_text_content(&gzip_compressed, &default_limits()).unwrap();
|
|
964
|
+
|
|
965
|
+
assert_eq!(contents.len(), 2);
|
|
966
|
+
assert_eq!(contents.get("test.txt").unwrap(), "Hello from tar.gz!");
|
|
967
|
+
assert_eq!(contents.get("readme.md").unwrap(), "# Markdown content");
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
#[test]
|
|
971
|
+
fn test_extract_gzip_compressed_tar_both() {
|
|
972
|
+
use flate2::Compression;
|
|
973
|
+
use flate2::write::GzEncoder;
|
|
974
|
+
use std::io::Write;
|
|
975
|
+
|
|
976
|
+
// Create a tar archive
|
|
977
|
+
let mut tar_data = Vec::new();
|
|
978
|
+
{
|
|
979
|
+
let mut tar = TarBuilder::new(&mut tar_data);
|
|
980
|
+
|
|
981
|
+
let data = b"Combined test content";
|
|
982
|
+
let mut header = ::tar::Header::new_gnu();
|
|
983
|
+
header.set_path("combined.txt").unwrap();
|
|
984
|
+
header.set_size(data.len() as u64);
|
|
985
|
+
header.set_cksum();
|
|
986
|
+
tar.append(&header, &data[..]).unwrap();
|
|
987
|
+
|
|
988
|
+
tar.finish().unwrap();
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
// Gzip compress the tar data
|
|
992
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
993
|
+
encoder.write_all(&tar_data).unwrap();
|
|
994
|
+
let gzip_compressed = encoder.finish().unwrap();
|
|
995
|
+
|
|
996
|
+
// Extract both metadata and content in one call
|
|
997
|
+
let (metadata, contents) = extract_gzip(&gzip_compressed, &default_limits()).unwrap();
|
|
998
|
+
|
|
999
|
+
assert_eq!(metadata.format, "GZIP+TAR");
|
|
1000
|
+
assert_eq!(metadata.file_count, 1);
|
|
1001
|
+
assert_eq!(contents.get("combined.txt").unwrap(), "Combined test content");
|
|
1002
|
+
}
|
|
881
1003
|
}
|
|
@@ -68,18 +68,63 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
|
|
|
68
68
|
#[cfg(not(feature = "office"))]
|
|
69
69
|
let office_metadata: Option<HashMap<String, String>> = None;
|
|
70
70
|
|
|
71
|
-
// For XLSX files, use specialized handler with OOM protection
|
|
72
|
-
if lower_path.ends_with(".xlsx")
|
|
73
|
-
|| lower_path.ends_with(".xlsm")
|
|
74
|
-
|| lower_path.ends_with(".xlam")
|
|
75
|
-
|| lower_path.ends_with(".xltm")
|
|
76
|
-
{
|
|
71
|
+
// For standard XLSX-format files, use specialized handler with OOM protection
|
|
72
|
+
if lower_path.ends_with(".xlsx") || lower_path.ends_with(".xlsm") || lower_path.ends_with(".xltm") {
|
|
77
73
|
let file = std::fs::File::open(file_path)?;
|
|
78
74
|
let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
|
|
79
75
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
|
|
80
76
|
return process_xlsx_workbook(workbook, office_metadata);
|
|
81
77
|
}
|
|
82
78
|
|
|
79
|
+
// For .xlam (Excel add-in), try XLSX parsing but gracefully return empty workbook on failure
|
|
80
|
+
if lower_path.ends_with(".xlam") {
|
|
81
|
+
let file = std::fs::File::open(file_path)?;
|
|
82
|
+
match calamine::Xlsx::new(std::io::BufReader::new(file)) {
|
|
83
|
+
Ok(workbook) => {
|
|
84
|
+
return process_xlsx_workbook(workbook, office_metadata);
|
|
85
|
+
}
|
|
86
|
+
Err(_) => {
|
|
87
|
+
// .xlam files may not contain proper workbook data - return empty workbook
|
|
88
|
+
return Ok(ExcelWorkbook {
|
|
89
|
+
sheets: vec![],
|
|
90
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
91
|
+
});
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// For .xla (legacy add-in), try XLS parsing but gracefully return empty workbook on failure
|
|
97
|
+
if lower_path.ends_with(".xla") {
|
|
98
|
+
let file = std::fs::File::open(file_path)?;
|
|
99
|
+
match calamine::Xls::new(std::io::BufReader::new(file)) {
|
|
100
|
+
Ok(workbook) => {
|
|
101
|
+
return process_workbook(workbook, office_metadata);
|
|
102
|
+
}
|
|
103
|
+
Err(_) => {
|
|
104
|
+
return Ok(ExcelWorkbook {
|
|
105
|
+
sheets: vec![],
|
|
106
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// For .xlsb (binary spreadsheet), try XLSB parsing but gracefully return empty workbook on failure
|
|
113
|
+
if lower_path.ends_with(".xlsb") {
|
|
114
|
+
let file = std::fs::File::open(file_path)?;
|
|
115
|
+
match calamine::Xlsb::new(std::io::BufReader::new(file)) {
|
|
116
|
+
Ok(workbook) => {
|
|
117
|
+
return process_workbook(workbook, office_metadata);
|
|
118
|
+
}
|
|
119
|
+
Err(_) => {
|
|
120
|
+
return Ok(ExcelWorkbook {
|
|
121
|
+
sheets: vec![],
|
|
122
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
83
128
|
// For other formats, use open_workbook_auto
|
|
84
129
|
let workbook = match open_workbook_auto(Path::new(file_path)) {
|
|
85
130
|
Ok(wb) => wb,
|
|
@@ -109,25 +154,66 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
|
|
|
109
154
|
#[cfg(not(feature = "office"))]
|
|
110
155
|
let office_metadata: Option<HashMap<String, String>> = None;
|
|
111
156
|
|
|
112
|
-
let cursor = Cursor::new(data);
|
|
113
|
-
|
|
114
157
|
match file_extension.to_lowercase().as_str() {
|
|
115
|
-
|
|
158
|
+
// Standard XLSX-format files: propagate errors
|
|
159
|
+
".xlsx" | ".xlsm" | ".xltm" => {
|
|
160
|
+
let cursor = Cursor::new(data);
|
|
116
161
|
let workbook = calamine::Xlsx::new(cursor)
|
|
117
162
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
|
|
118
163
|
process_xlsx_workbook(workbook, office_metadata)
|
|
119
164
|
}
|
|
120
|
-
|
|
165
|
+
// Exotic format: .xlam (Excel add-in) - may not contain proper workbook data
|
|
166
|
+
".xlam" => {
|
|
167
|
+
let cursor = Cursor::new(data);
|
|
168
|
+
match calamine::Xlsx::new(cursor) {
|
|
169
|
+
Ok(workbook) => process_xlsx_workbook(workbook, office_metadata),
|
|
170
|
+
Err(_) => {
|
|
171
|
+
// .xlam files may not contain proper workbook data - return empty workbook
|
|
172
|
+
Ok(ExcelWorkbook {
|
|
173
|
+
sheets: vec![],
|
|
174
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
175
|
+
})
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
// Standard XLS format: propagate errors
|
|
180
|
+
".xls" => {
|
|
181
|
+
let cursor = Cursor::new(data);
|
|
121
182
|
let workbook = calamine::Xls::new(cursor)
|
|
122
183
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLS: {}", e)))?;
|
|
123
184
|
process_workbook(workbook, office_metadata)
|
|
124
185
|
}
|
|
186
|
+
// Exotic format: .xla (legacy add-in) - may not contain proper workbook data
|
|
187
|
+
".xla" => {
|
|
188
|
+
let cursor = Cursor::new(data);
|
|
189
|
+
match calamine::Xls::new(cursor) {
|
|
190
|
+
Ok(workbook) => process_workbook(workbook, office_metadata),
|
|
191
|
+
Err(_) => {
|
|
192
|
+
// .xla files may not contain proper workbook data - return empty workbook
|
|
193
|
+
Ok(ExcelWorkbook {
|
|
194
|
+
sheets: vec![],
|
|
195
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
196
|
+
})
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
// Exotic format: .xlsb (binary spreadsheet) - may not contain proper workbook data
|
|
125
201
|
".xlsb" => {
|
|
126
|
-
let
|
|
127
|
-
|
|
128
|
-
|
|
202
|
+
let cursor = Cursor::new(data);
|
|
203
|
+
match calamine::Xlsb::new(cursor) {
|
|
204
|
+
Ok(workbook) => process_workbook(workbook, office_metadata),
|
|
205
|
+
Err(_) => {
|
|
206
|
+
// .xlsb files may not contain proper workbook data - return empty workbook
|
|
207
|
+
Ok(ExcelWorkbook {
|
|
208
|
+
sheets: vec![],
|
|
209
|
+
metadata: office_metadata.unwrap_or_default(),
|
|
210
|
+
})
|
|
211
|
+
}
|
|
212
|
+
}
|
|
129
213
|
}
|
|
214
|
+
// Standard OpenDocument format
|
|
130
215
|
".ods" => {
|
|
216
|
+
let cursor = Cursor::new(data);
|
|
131
217
|
let workbook = calamine::Ods::new(cursor)
|
|
132
218
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse ODS: {}", e)))?;
|
|
133
219
|
process_workbook(workbook, office_metadata)
|
|
@@ -96,14 +96,31 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
96
96
|
}
|
|
97
97
|
})?;
|
|
98
98
|
|
|
99
|
-
let document = pdfium.load_pdf_from_byte_slice(content, None)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
99
|
+
let document = match pdfium.load_pdf_from_byte_slice(content, None) {
|
|
100
|
+
Ok(doc) => doc,
|
|
101
|
+
Err(e) => {
|
|
102
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
103
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
104
|
+
#[cfg(feature = "otel")]
|
|
105
|
+
tracing::warn!("Password-protected PDF encountered in WASM, returning empty result");
|
|
106
|
+
} else {
|
|
107
|
+
#[cfg(feature = "otel")]
|
|
108
|
+
tracing::warn!("Malformed or invalid PDF encountered in WASM: {}", err_msg);
|
|
109
|
+
}
|
|
110
|
+
return Ok(ExtractionResult {
|
|
111
|
+
content: String::new(),
|
|
112
|
+
mime_type: mime_type.to_string().into(),
|
|
113
|
+
metadata: Metadata::default(),
|
|
114
|
+
tables: vec![],
|
|
115
|
+
detected_languages: None,
|
|
116
|
+
chunks: None,
|
|
117
|
+
images: None,
|
|
118
|
+
pages: None,
|
|
119
|
+
djot_content: None,
|
|
120
|
+
elements: None,
|
|
121
|
+
});
|
|
105
122
|
}
|
|
106
|
-
}
|
|
123
|
+
};
|
|
107
124
|
|
|
108
125
|
extract_all_from_document(&document, config)?
|
|
109
126
|
}
|
|
@@ -113,23 +130,27 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
113
130
|
let content_owned = content.to_vec();
|
|
114
131
|
let span = tracing::Span::current();
|
|
115
132
|
let config_owned = config.clone();
|
|
116
|
-
tokio::task::spawn_blocking(move || {
|
|
133
|
+
let result = tokio::task::spawn_blocking(move || {
|
|
117
134
|
let _guard = span.entered();
|
|
118
135
|
|
|
119
136
|
let pdfium =
|
|
120
137
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
121
138
|
|
|
122
|
-
let document = pdfium.load_pdf_from_byte_slice(&content_owned, None)
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
139
|
+
let document = match pdfium.load_pdf_from_byte_slice(&content_owned, None) {
|
|
140
|
+
Ok(doc) => doc,
|
|
141
|
+
Err(e) => {
|
|
142
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
143
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
144
|
+
return Err(PdfError::PasswordRequired);
|
|
145
|
+
} else {
|
|
146
|
+
return Err(PdfError::InvalidPdf(err_msg));
|
|
147
|
+
}
|
|
128
148
|
}
|
|
129
|
-
}
|
|
149
|
+
};
|
|
130
150
|
|
|
131
151
|
let (pdf_metadata, native_text, tables, page_contents, _boundaries) =
|
|
132
|
-
extract_all_from_document(&document, &config_owned)
|
|
152
|
+
extract_all_from_document(&document, &config_owned)
|
|
153
|
+
.map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
|
|
133
154
|
|
|
134
155
|
if let Some(page_cfg) = config_owned.pages.as_ref()
|
|
135
156
|
&& page_cfg.extract_pages
|
|
@@ -138,11 +159,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
138
159
|
return Err(PdfError::ExtractionFailed(
|
|
139
160
|
"Page extraction was configured but no page data was extracted in batch mode"
|
|
140
161
|
.to_string(),
|
|
141
|
-
)
|
|
142
|
-
.into());
|
|
162
|
+
));
|
|
143
163
|
}
|
|
144
164
|
|
|
145
|
-
Ok::<_, crate::error::KreuzbergError>((
|
|
165
|
+
Ok::<_, crate::pdf::error::PdfError>((
|
|
146
166
|
pdf_metadata,
|
|
147
167
|
native_text,
|
|
148
168
|
tables,
|
|
@@ -151,19 +171,67 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
151
171
|
))
|
|
152
172
|
})
|
|
153
173
|
.await
|
|
154
|
-
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))
|
|
174
|
+
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))?;
|
|
175
|
+
|
|
176
|
+
match result {
|
|
177
|
+
Ok(tuple) => tuple,
|
|
178
|
+
Err(e) => {
|
|
179
|
+
let err_msg = e.to_string();
|
|
180
|
+
if err_msg.contains("password")
|
|
181
|
+
|| err_msg.contains("Password")
|
|
182
|
+
|| err_msg.contains("password-protected")
|
|
183
|
+
{
|
|
184
|
+
#[cfg(feature = "otel")]
|
|
185
|
+
tracing::warn!(
|
|
186
|
+
"Password-protected PDF encountered in batch mode, returning empty result"
|
|
187
|
+
);
|
|
188
|
+
} else {
|
|
189
|
+
#[cfg(feature = "otel")]
|
|
190
|
+
tracing::warn!("Malformed or invalid PDF encountered in batch mode: {}", err_msg);
|
|
191
|
+
}
|
|
192
|
+
return Ok(ExtractionResult {
|
|
193
|
+
content: String::new(),
|
|
194
|
+
mime_type: mime_type.to_string().into(),
|
|
195
|
+
metadata: Metadata::default(),
|
|
196
|
+
tables: vec![],
|
|
197
|
+
detected_languages: None,
|
|
198
|
+
chunks: None,
|
|
199
|
+
images: None,
|
|
200
|
+
pages: None,
|
|
201
|
+
djot_content: None,
|
|
202
|
+
elements: None,
|
|
203
|
+
});
|
|
204
|
+
}
|
|
205
|
+
}
|
|
155
206
|
} else {
|
|
156
207
|
let pdfium =
|
|
157
208
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
158
209
|
|
|
159
|
-
let document = pdfium.load_pdf_from_byte_slice(content, None)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
210
|
+
let document = match pdfium.load_pdf_from_byte_slice(content, None) {
|
|
211
|
+
Ok(doc) => doc,
|
|
212
|
+
Err(e) => {
|
|
213
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
214
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
215
|
+
#[cfg(feature = "otel")]
|
|
216
|
+
tracing::warn!("Password-protected PDF encountered, returning empty result");
|
|
217
|
+
} else {
|
|
218
|
+
#[cfg(feature = "otel")]
|
|
219
|
+
tracing::warn!("Malformed or invalid PDF encountered: {}", err_msg);
|
|
220
|
+
}
|
|
221
|
+
return Ok(ExtractionResult {
|
|
222
|
+
content: String::new(),
|
|
223
|
+
mime_type: mime_type.to_string().into(),
|
|
224
|
+
metadata: Metadata::default(),
|
|
225
|
+
tables: vec![],
|
|
226
|
+
detected_languages: None,
|
|
227
|
+
chunks: None,
|
|
228
|
+
images: None,
|
|
229
|
+
pages: None,
|
|
230
|
+
djot_content: None,
|
|
231
|
+
elements: None,
|
|
232
|
+
});
|
|
165
233
|
}
|
|
166
|
-
}
|
|
234
|
+
};
|
|
167
235
|
|
|
168
236
|
extract_all_from_document(&document, config)?
|
|
169
237
|
}
|
|
@@ -173,14 +241,31 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
173
241
|
let pdfium =
|
|
174
242
|
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
175
243
|
|
|
176
|
-
let document = pdfium.load_pdf_from_byte_slice(content, None)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
244
|
+
let document = match pdfium.load_pdf_from_byte_slice(content, None) {
|
|
245
|
+
Ok(doc) => doc,
|
|
246
|
+
Err(e) => {
|
|
247
|
+
let err_msg = crate::pdf::error::format_pdfium_error(e);
|
|
248
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
249
|
+
#[cfg(feature = "otel")]
|
|
250
|
+
tracing::warn!("Password-protected PDF encountered, returning empty result");
|
|
251
|
+
} else {
|
|
252
|
+
#[cfg(feature = "otel")]
|
|
253
|
+
tracing::warn!("Malformed or invalid PDF encountered: {}", err_msg);
|
|
254
|
+
}
|
|
255
|
+
return Ok(ExtractionResult {
|
|
256
|
+
content: String::new(),
|
|
257
|
+
mime_type: mime_type.to_string().into(),
|
|
258
|
+
metadata: Metadata::default(),
|
|
259
|
+
tables: vec![],
|
|
260
|
+
detected_languages: None,
|
|
261
|
+
chunks: None,
|
|
262
|
+
images: None,
|
|
263
|
+
pages: None,
|
|
264
|
+
djot_content: None,
|
|
265
|
+
elements: None,
|
|
266
|
+
});
|
|
182
267
|
}
|
|
183
|
-
}
|
|
268
|
+
};
|
|
184
269
|
|
|
185
270
|
extract_all_from_document(&document, config)?
|
|
186
271
|
}
|
|
@@ -628,4 +713,52 @@ mod tests {
|
|
|
628
713
|
let extractor = PdfExtractor::new();
|
|
629
714
|
assert_eq!(extractor.name(), "pdf-extractor");
|
|
630
715
|
}
|
|
716
|
+
|
|
717
|
+
#[tokio::test]
|
|
718
|
+
#[cfg(feature = "pdf")]
|
|
719
|
+
async fn test_pdf_gracefully_handles_malformed_pdf() {
|
|
720
|
+
let extractor = PdfExtractor::new();
|
|
721
|
+
let config = ExtractionConfig::default();
|
|
722
|
+
|
|
723
|
+
// Create a malformed PDF: just some random bytes that start with %PDF but are incomplete
|
|
724
|
+
let malformed_pdf = b"%PDF-1.4\nmalformed content that is not a valid PDF".to_vec();
|
|
725
|
+
|
|
726
|
+
let result = extractor
|
|
727
|
+
.extract_bytes(&malformed_pdf, "application/pdf", &config)
|
|
728
|
+
.await;
|
|
729
|
+
|
|
730
|
+
assert!(
|
|
731
|
+
result.is_ok(),
|
|
732
|
+
"Malformed PDF should be handled gracefully, not return an error"
|
|
733
|
+
);
|
|
734
|
+
|
|
735
|
+
let extraction_result = result.unwrap();
|
|
736
|
+
assert_eq!(extraction_result.content, "", "Malformed PDF should have empty content");
|
|
737
|
+
assert_eq!(extraction_result.tables.len(), 0, "Malformed PDF should have no tables");
|
|
738
|
+
assert_eq!(
|
|
739
|
+
extraction_result.mime_type.as_ref() as &str,
|
|
740
|
+
"application/pdf",
|
|
741
|
+
"MIME type should be preserved"
|
|
742
|
+
);
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
#[tokio::test]
|
|
746
|
+
#[cfg(feature = "pdf")]
|
|
747
|
+
async fn test_pdf_gracefully_handles_invalid_bytes() {
|
|
748
|
+
let extractor = PdfExtractor::new();
|
|
749
|
+
let config = ExtractionConfig::default();
|
|
750
|
+
|
|
751
|
+
// Create completely invalid data (not a PDF at all)
|
|
752
|
+
let invalid_data = b"This is not a PDF file, just random text".to_vec();
|
|
753
|
+
|
|
754
|
+
let result = extractor.extract_bytes(&invalid_data, "application/pdf", &config).await;
|
|
755
|
+
|
|
756
|
+
assert!(
|
|
757
|
+
result.is_ok(),
|
|
758
|
+
"Invalid PDF data should be handled gracefully, not return an error"
|
|
759
|
+
);
|
|
760
|
+
|
|
761
|
+
let extraction_result = result.unwrap();
|
|
762
|
+
assert_eq!(extraction_result.content, "", "Invalid PDF should have empty content");
|
|
763
|
+
}
|
|
631
764
|
}
|
|
@@ -254,14 +254,17 @@ pub(crate) fn bind_pdfium(
|
|
|
254
254
|
mod tests {
|
|
255
255
|
use super::*;
|
|
256
256
|
use crate::pdf::error::PdfError;
|
|
257
|
+
use serial_test::serial;
|
|
257
258
|
|
|
258
259
|
#[test]
|
|
260
|
+
#[serial]
|
|
259
261
|
fn test_bind_pdfium_lazy_initialization() {
|
|
260
262
|
let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
|
|
261
263
|
assert!(result.is_ok(), "First bind_pdfium call should succeed");
|
|
262
264
|
}
|
|
263
265
|
|
|
264
266
|
#[test]
|
|
267
|
+
#[serial]
|
|
265
268
|
fn test_bind_pdfium_multiple_calls() {
|
|
266
269
|
// First call - acquire lock, test success, then drop handle to release lock
|
|
267
270
|
{
|
|
@@ -277,6 +280,7 @@ mod tests {
|
|
|
277
280
|
}
|
|
278
281
|
|
|
279
282
|
#[test]
|
|
283
|
+
#[serial]
|
|
280
284
|
fn test_bind_pdfium_returns_same_instance() {
|
|
281
285
|
// Get pointer from first handle, then drop it to release lock
|
|
282
286
|
let ptr1 = {
|
|
@@ -295,6 +299,7 @@ mod tests {
|
|
|
295
299
|
}
|
|
296
300
|
|
|
297
301
|
#[test]
|
|
302
|
+
#[serial]
|
|
298
303
|
fn test_bind_pdfium_error_mapping() {
|
|
299
304
|
let map_err = |msg: String| PdfError::TextExtractionFailed(msg);
|
|
300
305
|
|
|
@@ -308,6 +313,7 @@ mod tests {
|
|
|
308
313
|
}
|
|
309
314
|
|
|
310
315
|
#[test]
|
|
316
|
+
#[serial]
|
|
311
317
|
fn test_pdfium_handle_deref() {
|
|
312
318
|
let handle = bind_pdfium(PdfError::TextExtractionFailed, "test").unwrap();
|
|
313
319
|
|
|
@@ -169,14 +169,17 @@ fn calculate_optimal_dpi(
|
|
|
169
169
|
#[cfg(test)]
|
|
170
170
|
mod tests {
|
|
171
171
|
use super::*;
|
|
172
|
+
use serial_test::serial;
|
|
172
173
|
|
|
173
174
|
#[test]
|
|
175
|
+
#[serial]
|
|
174
176
|
fn test_renderer_creation() {
|
|
175
177
|
let result = PdfRenderer::new();
|
|
176
178
|
assert!(result.is_ok());
|
|
177
179
|
}
|
|
178
180
|
|
|
179
181
|
#[test]
|
|
182
|
+
#[serial]
|
|
180
183
|
fn test_render_invalid_pdf() {
|
|
181
184
|
let renderer = PdfRenderer::new().unwrap();
|
|
182
185
|
let options = PageRenderOptions::default();
|
|
@@ -186,6 +189,7 @@ mod tests {
|
|
|
186
189
|
}
|
|
187
190
|
|
|
188
191
|
#[test]
|
|
192
|
+
#[serial]
|
|
189
193
|
fn test_render_page_not_found() {
|
|
190
194
|
let renderer = PdfRenderer::new().unwrap();
|
|
191
195
|
let options = PageRenderOptions::default();
|
|
@@ -242,6 +246,7 @@ mod tests {
|
|
|
242
246
|
}
|
|
243
247
|
|
|
244
248
|
#[test]
|
|
249
|
+
#[serial]
|
|
245
250
|
fn test_render_all_pages_empty_pdf() {
|
|
246
251
|
let renderer = PdfRenderer::new().unwrap();
|
|
247
252
|
let options = PageRenderOptions::default();
|
|
@@ -250,6 +255,7 @@ mod tests {
|
|
|
250
255
|
}
|
|
251
256
|
|
|
252
257
|
#[test]
|
|
258
|
+
#[serial]
|
|
253
259
|
fn test_render_page_with_password_none() {
|
|
254
260
|
let renderer = PdfRenderer::new().unwrap();
|
|
255
261
|
let options = PageRenderOptions::default();
|
|
@@ -258,6 +264,7 @@ mod tests {
|
|
|
258
264
|
}
|
|
259
265
|
|
|
260
266
|
#[test]
|
|
267
|
+
#[serial]
|
|
261
268
|
fn test_render_all_pages_with_password_none() {
|
|
262
269
|
let renderer = PdfRenderer::new().unwrap();
|
|
263
270
|
let options = PageRenderOptions::default();
|
|
@@ -266,6 +273,7 @@ mod tests {
|
|
|
266
273
|
}
|
|
267
274
|
|
|
268
275
|
#[test]
|
|
276
|
+
#[serial]
|
|
269
277
|
fn test_render_page_to_image_function() {
|
|
270
278
|
let options = PageRenderOptions::default();
|
|
271
279
|
let result = render_page_to_image(b"not a pdf", 0, &options);
|
|
@@ -348,6 +356,7 @@ mod tests {
|
|
|
348
356
|
}
|
|
349
357
|
|
|
350
358
|
#[test]
|
|
359
|
+
#[serial]
|
|
351
360
|
fn test_render_empty_bytes() {
|
|
352
361
|
let renderer = PdfRenderer::new().unwrap();
|
|
353
362
|
let options = PageRenderOptions::default();
|
|
@@ -441,14 +441,17 @@ fn extract_page_hierarchy(
|
|
|
441
441
|
#[cfg(test)]
|
|
442
442
|
mod tests {
|
|
443
443
|
use super::*;
|
|
444
|
+
use serial_test::serial;
|
|
444
445
|
|
|
445
446
|
#[test]
|
|
447
|
+
#[serial]
|
|
446
448
|
fn test_extractor_creation() {
|
|
447
449
|
let result = PdfTextExtractor::new();
|
|
448
450
|
assert!(result.is_ok());
|
|
449
451
|
}
|
|
450
452
|
|
|
451
453
|
#[test]
|
|
454
|
+
#[serial]
|
|
452
455
|
fn test_extract_empty_pdf() {
|
|
453
456
|
let extractor = PdfTextExtractor::new().unwrap();
|
|
454
457
|
let result = extractor.extract_text(b"");
|
|
@@ -456,6 +459,7 @@ mod tests {
|
|
|
456
459
|
}
|
|
457
460
|
|
|
458
461
|
#[test]
|
|
462
|
+
#[serial]
|
|
459
463
|
fn test_extract_invalid_pdf() {
|
|
460
464
|
let extractor = PdfTextExtractor::new().unwrap();
|
|
461
465
|
let result = extractor.extract_text(b"not a pdf");
|
|
@@ -464,6 +468,7 @@ mod tests {
|
|
|
464
468
|
}
|
|
465
469
|
|
|
466
470
|
#[test]
|
|
471
|
+
#[serial]
|
|
467
472
|
fn test_password_required_detection() {
|
|
468
473
|
let extractor = PdfTextExtractor::new().unwrap();
|
|
469
474
|
let encrypted_pdf = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";
|
|
@@ -475,6 +480,7 @@ mod tests {
|
|
|
475
480
|
}
|
|
476
481
|
|
|
477
482
|
#[test]
|
|
483
|
+
#[serial]
|
|
478
484
|
fn test_extract_text_with_passwords_empty_list() {
|
|
479
485
|
let extractor = PdfTextExtractor::new().unwrap();
|
|
480
486
|
let result = extractor.extract_text_with_passwords(b"not a pdf", &[]);
|
|
@@ -485,6 +491,7 @@ mod tests {
|
|
|
485
491
|
#[cfg(test)]
|
|
486
492
|
mod cache_regression_tests {
|
|
487
493
|
use super::*;
|
|
494
|
+
use serial_test::serial;
|
|
488
495
|
use std::time::Instant;
|
|
489
496
|
|
|
490
497
|
/// Test that multiple extractions of the same document produce consistent results.
|
|
@@ -497,6 +504,7 @@ mod cache_regression_tests {
|
|
|
497
504
|
/// 1. Multiple extractions produce identical text content
|
|
498
505
|
/// 2. The singleton pattern provides consistent extraction behavior
|
|
499
506
|
#[test]
|
|
507
|
+
#[serial]
|
|
500
508
|
fn test_no_global_cache_between_documents() {
|
|
501
509
|
let pdf_bytes = std::fs::read("../../test_documents/pdf/fake_memo.pdf").expect("Failed to read PDF");
|
|
502
510
|
|