kreuzberg 4.2.13 → 4.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3c2053b10256948a215ff0d3552894991e801497ac4b2480eca3c98bb645cc27
4
- data.tar.gz: 324b6147e172ecedb2338fab1b14ce2022a8b9c2d6be7fd86ac0f862d81ef7ce
3
+ metadata.gz: da61e06dfa4643e485c13636998888f03699816b7462087c9df6c9639d53fc45
4
+ data.tar.gz: 20a9c88f3eac809d2d158e15ea3747c425d47b3af0e2bf93825c831c9aa11aa9
5
5
  SHA512:
6
- metadata.gz: 84a6636111d240c99eb17546f80c1df31117c700d78282c18a67a79aa613021d33988cbc1b00d5bc62bb2ffeef8c2a8f1759e137329de8f30af7f61b6db1a55b
7
- data.tar.gz: 7628ecce3c6fb44c06a9546f2db696ae3486de35e0a05195cbea752bc6f78e573162e6305aec8c8ae0ca0fdbb6709a3e75752b822dbd8aed637eff9577c3e020
6
+ metadata.gz: 7be55db6494d45de03b3fee1271e1bc151193709098bdfe94fb7a5fb33159dd9a0b8b08fffd5ed2d3b24f3f3766c0bb1e81b42319d25b529088afe7e6a4c52d6
7
+ data.tar.gz: bd94796f90094ca64775c0ded247bc216fdfe6ee50d4c6258685ccc5b5b33e1a69546cd892ac7dce77eee72123cbaf6ce239ee932a117c8ad5dfe801a9a548bf
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.13)
4
+ kreuzberg (4.2.14)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.2.13)
212
+ kreuzberg (4.2.14)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.14" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.2.13"
40
+ version = "4.2.14"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.13'
4
+ VERSION = '4.2.14'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.13"
6
+ version = "4.2.14"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.13"
3
+ version = "4.2.14"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.13 Release**
20
+ > **🚀 Version 4.2.14 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -3,7 +3,7 @@
3
3
  //! This module provides shared utilities used across extraction modules.
4
4
 
5
5
  use crate::plugins::DocumentExtractor;
6
- #[cfg(feature = "office")]
6
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
7
7
  use crate::utils::intern_mime_type;
8
8
  use crate::utils::{PoolSizeHint, estimate_pool_size};
9
9
  use crate::{KreuzbergError, Result};
@@ -65,7 +65,7 @@ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
65
65
  ///
66
66
  /// For pre-interned MIME types (all common types), this is O(1) pointer dereference.
67
67
  /// For unknown MIME types, this allocates once per unique type and caches the result.
68
- #[cfg(feature = "office")]
68
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
69
69
  pub(in crate::core::extractor) fn pool_mime_type(mime_type: &str) -> String {
70
70
  intern_mime_type(mime_type).to_string()
71
71
  }
@@ -104,7 +104,7 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
104
104
  m.insert("zip", "application/zip");
105
105
  m.insert("tar", "application/x-tar");
106
106
  m.insert("gz", "application/gzip");
107
- m.insert("tgz", "application/x-tar");
107
+ m.insert("tgz", "application/gzip");
108
108
  m.insert("7z", "application/x-7z-compressed");
109
109
 
110
110
  m.insert("rst", "text/x-rst");
@@ -2,6 +2,10 @@
2
2
  //!
3
3
  //! Provides functions for decompressing gzip files and extracting
4
4
  //! metadata and text content from the compressed data.
5
+ //!
6
+ //! When a gzip file contains a TAR archive (e.g., .tar.gz files),
7
+ //! this module automatically detects the TAR format and delegates
8
+ //! to the TAR extraction functions.
5
9
 
6
10
  use super::{ArchiveEntry, ArchiveMetadata};
7
11
  use crate::error::{KreuzbergError, Result};
@@ -10,6 +14,14 @@ use flate2::read::GzDecoder;
10
14
  use std::collections::HashMap;
11
15
  use std::io::Read;
12
16
 
17
+ /// Check if data looks like a TAR archive (has "ustar" magic at offset 257).
18
+ ///
19
+ /// The TAR format has a standard USTAR header starting at offset 257,
20
+ /// which helps identify TAR archives that have been gzip-compressed.
21
+ fn is_tar_archive(data: &[u8]) -> bool {
22
+ data.len() > 262 && &data[257..262] == b"ustar"
23
+ }
24
+
13
25
  /// Decompress gzip bytes with a size limit to prevent decompression bombs.
14
26
  fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
15
27
  let decoder = GzDecoder::new(bytes);
@@ -38,9 +50,19 @@ pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>>
38
50
  ///
39
51
  /// This avoids the overhead of decompressing the data multiple times when both
40
52
  /// metadata and text content are needed.
53
+ ///
54
+ /// If the decompressed data is a TAR archive, delegates to TAR extraction functions.
41
55
  pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
42
56
  let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
43
57
 
58
+ // Check if the decompressed data is a TAR archive
59
+ if is_tar_archive(&decompressed) {
60
+ let mut metadata = super::tar::extract_tar_metadata(&decompressed, limits)?;
61
+ metadata.format = "GZIP+TAR".to_string();
62
+ let contents = super::tar::extract_tar_text_content(&decompressed, limits)?;
63
+ return Ok((metadata, contents));
64
+ }
65
+
44
66
  // Re-read header for filename (lightweight - no decompression)
45
67
  let mut decoder = GzDecoder::new(bytes);
46
68
  let mut _discard = [0u8; 1];
@@ -77,9 +99,18 @@ pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMet
77
99
  ///
78
100
  /// Gzip wraps a single stream, so the metadata contains one entry
79
101
  /// with the original filename (from gzip header) and decompressed size.
102
+ ///
103
+ /// If the decompressed data is a TAR archive, delegates to TAR extraction.
80
104
  pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
81
105
  let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
82
106
 
107
+ // Check if the decompressed data is a TAR archive
108
+ if is_tar_archive(&decompressed) {
109
+ let mut metadata = super::tar::extract_tar_metadata(&decompressed, limits)?;
110
+ metadata.format = "GZIP+TAR".to_string();
111
+ return Ok(metadata);
112
+ }
113
+
83
114
  let mut decoder = GzDecoder::new(bytes);
84
115
  let mut _discard = [0u8; 1];
85
116
  let _ = decoder.read(&mut _discard);
@@ -107,9 +138,16 @@ pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<Ar
107
138
  /// Extract text content from a gzip-compressed file.
108
139
  ///
109
140
  /// Decompresses and attempts to read the result as UTF-8 text.
141
+ ///
142
+ /// If the decompressed data is a TAR archive, delegates to TAR extraction.
110
143
  pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
111
144
  let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
112
145
 
146
+ // Check if the decompressed data is a TAR archive
147
+ if is_tar_archive(&decompressed) {
148
+ return super::tar::extract_tar_text_content(&decompressed, limits);
149
+ }
150
+
113
151
  let mut decoder = GzDecoder::new(bytes);
114
152
  let mut _discard = [0u8; 1];
115
153
  let _ = decoder.read(&mut _discard);
@@ -878,4 +878,126 @@ mod tests {
878
878
  let result = extract_gzip_metadata(&compressed, &limits);
879
879
  assert!(result.is_err());
880
880
  }
881
+
882
+ #[test]
883
+ fn test_extract_gzip_compressed_tar_metadata() {
884
+ use flate2::Compression;
885
+ use flate2::write::GzEncoder;
886
+ use std::io::Write;
887
+
888
+ // Create a tar archive
889
+ let mut tar_data = Vec::new();
890
+ {
891
+ let mut tar = TarBuilder::new(&mut tar_data);
892
+
893
+ let data1 = b"Hello from tar.gz!";
894
+ let mut header1 = ::tar::Header::new_gnu();
895
+ header1.set_path("test.txt").unwrap();
896
+ header1.set_size(data1.len() as u64);
897
+ header1.set_cksum();
898
+ tar.append(&header1, &data1[..]).unwrap();
899
+
900
+ let data2 = b"# Markdown file";
901
+ let mut header2 = ::tar::Header::new_gnu();
902
+ header2.set_path("readme.md").unwrap();
903
+ header2.set_size(data2.len() as u64);
904
+ header2.set_cksum();
905
+ tar.append(&header2, &data2[..]).unwrap();
906
+
907
+ tar.finish().unwrap();
908
+ }
909
+
910
+ // Gzip compress the tar data
911
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
912
+ encoder.write_all(&tar_data).unwrap();
913
+ let gzip_compressed = encoder.finish().unwrap();
914
+
915
+ // Extract metadata from the gzip-compressed tar
916
+ let metadata = extract_gzip_metadata(&gzip_compressed, &default_limits()).unwrap();
917
+
918
+ assert_eq!(metadata.format, "GZIP+TAR");
919
+ assert_eq!(metadata.file_count, 2);
920
+ assert_eq!(metadata.file_list.len(), 2);
921
+ assert!(metadata.total_size > 0);
922
+
923
+ // Verify file paths are preserved
924
+ let paths: Vec<&str> = metadata.file_list.iter().map(|e| e.path.as_str()).collect();
925
+ assert!(paths.contains(&"test.txt"));
926
+ assert!(paths.contains(&"readme.md"));
927
+ }
928
+
929
+ #[test]
930
+ fn test_extract_gzip_compressed_tar_text_content() {
931
+ use flate2::Compression;
932
+ use flate2::write::GzEncoder;
933
+ use std::io::Write;
934
+
935
+ // Create a tar archive
936
+ let mut tar_data = Vec::new();
937
+ {
938
+ let mut tar = TarBuilder::new(&mut tar_data);
939
+
940
+ let data1 = b"Hello from tar.gz!";
941
+ let mut header1 = ::tar::Header::new_gnu();
942
+ header1.set_path("test.txt").unwrap();
943
+ header1.set_size(data1.len() as u64);
944
+ header1.set_cksum();
945
+ tar.append(&header1, &data1[..]).unwrap();
946
+
947
+ let data2 = b"# Markdown content";
948
+ let mut header2 = ::tar::Header::new_gnu();
949
+ header2.set_path("readme.md").unwrap();
950
+ header2.set_size(data2.len() as u64);
951
+ header2.set_cksum();
952
+ tar.append(&header2, &data2[..]).unwrap();
953
+
954
+ tar.finish().unwrap();
955
+ }
956
+
957
+ // Gzip compress the tar data
958
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
959
+ encoder.write_all(&tar_data).unwrap();
960
+ let gzip_compressed = encoder.finish().unwrap();
961
+
962
+ // Extract text content from the gzip-compressed tar
963
+ let contents = extract_gzip_text_content(&gzip_compressed, &default_limits()).unwrap();
964
+
965
+ assert_eq!(contents.len(), 2);
966
+ assert_eq!(contents.get("test.txt").unwrap(), "Hello from tar.gz!");
967
+ assert_eq!(contents.get("readme.md").unwrap(), "# Markdown content");
968
+ }
969
+
970
+ #[test]
971
+ fn test_extract_gzip_compressed_tar_both() {
972
+ use flate2::Compression;
973
+ use flate2::write::GzEncoder;
974
+ use std::io::Write;
975
+
976
+ // Create a tar archive
977
+ let mut tar_data = Vec::new();
978
+ {
979
+ let mut tar = TarBuilder::new(&mut tar_data);
980
+
981
+ let data = b"Combined test content";
982
+ let mut header = ::tar::Header::new_gnu();
983
+ header.set_path("combined.txt").unwrap();
984
+ header.set_size(data.len() as u64);
985
+ header.set_cksum();
986
+ tar.append(&header, &data[..]).unwrap();
987
+
988
+ tar.finish().unwrap();
989
+ }
990
+
991
+ // Gzip compress the tar data
992
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
993
+ encoder.write_all(&tar_data).unwrap();
994
+ let gzip_compressed = encoder.finish().unwrap();
995
+
996
+ // Extract both metadata and content in one call
997
+ let (metadata, contents) = extract_gzip(&gzip_compressed, &default_limits()).unwrap();
998
+
999
+ assert_eq!(metadata.format, "GZIP+TAR");
1000
+ assert_eq!(metadata.file_count, 1);
1001
+ assert_eq!(contents.get("combined.txt").unwrap(), "Combined test content");
1002
+ }
881
1003
  }
@@ -68,18 +68,63 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
68
68
  #[cfg(not(feature = "office"))]
69
69
  let office_metadata: Option<HashMap<String, String>> = None;
70
70
 
71
- // For XLSX files, use specialized handler with OOM protection
72
- if lower_path.ends_with(".xlsx")
73
- || lower_path.ends_with(".xlsm")
74
- || lower_path.ends_with(".xlam")
75
- || lower_path.ends_with(".xltm")
76
- {
71
+ // For standard XLSX-format files, use specialized handler with OOM protection
72
+ if lower_path.ends_with(".xlsx") || lower_path.ends_with(".xlsm") || lower_path.ends_with(".xltm") {
77
73
  let file = std::fs::File::open(file_path)?;
78
74
  let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
79
75
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
80
76
  return process_xlsx_workbook(workbook, office_metadata);
81
77
  }
82
78
 
79
+ // For .xlam (Excel add-in), try XLSX parsing but gracefully return empty workbook on failure
80
+ if lower_path.ends_with(".xlam") {
81
+ let file = std::fs::File::open(file_path)?;
82
+ match calamine::Xlsx::new(std::io::BufReader::new(file)) {
83
+ Ok(workbook) => {
84
+ return process_xlsx_workbook(workbook, office_metadata);
85
+ }
86
+ Err(_) => {
87
+ // .xlam files may not contain proper workbook data - return empty workbook
88
+ return Ok(ExcelWorkbook {
89
+ sheets: vec![],
90
+ metadata: office_metadata.unwrap_or_default(),
91
+ });
92
+ }
93
+ }
94
+ }
95
+
96
+ // For .xla (legacy add-in), try XLS parsing but gracefully return empty workbook on failure
97
+ if lower_path.ends_with(".xla") {
98
+ let file = std::fs::File::open(file_path)?;
99
+ match calamine::Xls::new(std::io::BufReader::new(file)) {
100
+ Ok(workbook) => {
101
+ return process_workbook(workbook, office_metadata);
102
+ }
103
+ Err(_) => {
104
+ return Ok(ExcelWorkbook {
105
+ sheets: vec![],
106
+ metadata: office_metadata.unwrap_or_default(),
107
+ });
108
+ }
109
+ }
110
+ }
111
+
112
+ // For .xlsb (binary spreadsheet), try XLSB parsing but gracefully return empty workbook on failure
113
+ if lower_path.ends_with(".xlsb") {
114
+ let file = std::fs::File::open(file_path)?;
115
+ match calamine::Xlsb::new(std::io::BufReader::new(file)) {
116
+ Ok(workbook) => {
117
+ return process_workbook(workbook, office_metadata);
118
+ }
119
+ Err(_) => {
120
+ return Ok(ExcelWorkbook {
121
+ sheets: vec![],
122
+ metadata: office_metadata.unwrap_or_default(),
123
+ });
124
+ }
125
+ }
126
+ }
127
+
83
128
  // For other formats, use open_workbook_auto
84
129
  let workbook = match open_workbook_auto(Path::new(file_path)) {
85
130
  Ok(wb) => wb,
@@ -109,25 +154,66 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
109
154
  #[cfg(not(feature = "office"))]
110
155
  let office_metadata: Option<HashMap<String, String>> = None;
111
156
 
112
- let cursor = Cursor::new(data);
113
-
114
157
  match file_extension.to_lowercase().as_str() {
115
- ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
158
+ // Standard XLSX-format files: propagate errors
159
+ ".xlsx" | ".xlsm" | ".xltm" => {
160
+ let cursor = Cursor::new(data);
116
161
  let workbook = calamine::Xlsx::new(cursor)
117
162
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
118
163
  process_xlsx_workbook(workbook, office_metadata)
119
164
  }
120
- ".xls" | ".xla" => {
165
+ // Exotic format: .xlam (Excel add-in) - may not contain proper workbook data
166
+ ".xlam" => {
167
+ let cursor = Cursor::new(data);
168
+ match calamine::Xlsx::new(cursor) {
169
+ Ok(workbook) => process_xlsx_workbook(workbook, office_metadata),
170
+ Err(_) => {
171
+ // .xlam files may not contain proper workbook data - return empty workbook
172
+ Ok(ExcelWorkbook {
173
+ sheets: vec![],
174
+ metadata: office_metadata.unwrap_or_default(),
175
+ })
176
+ }
177
+ }
178
+ }
179
+ // Standard XLS format: propagate errors
180
+ ".xls" => {
181
+ let cursor = Cursor::new(data);
121
182
  let workbook = calamine::Xls::new(cursor)
122
183
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLS: {}", e)))?;
123
184
  process_workbook(workbook, office_metadata)
124
185
  }
186
+ // Exotic format: .xla (legacy add-in) - may not contain proper workbook data
187
+ ".xla" => {
188
+ let cursor = Cursor::new(data);
189
+ match calamine::Xls::new(cursor) {
190
+ Ok(workbook) => process_workbook(workbook, office_metadata),
191
+ Err(_) => {
192
+ // .xla files may not contain proper workbook data - return empty workbook
193
+ Ok(ExcelWorkbook {
194
+ sheets: vec![],
195
+ metadata: office_metadata.unwrap_or_default(),
196
+ })
197
+ }
198
+ }
199
+ }
200
+ // Exotic format: .xlsb (binary spreadsheet) - may not contain proper workbook data
125
201
  ".xlsb" => {
126
- let workbook = calamine::Xlsb::new(cursor)
127
- .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSB: {}", e)))?;
128
- process_workbook(workbook, office_metadata)
202
+ let cursor = Cursor::new(data);
203
+ match calamine::Xlsb::new(cursor) {
204
+ Ok(workbook) => process_workbook(workbook, office_metadata),
205
+ Err(_) => {
206
+ // .xlsb files may not contain proper workbook data - return empty workbook
207
+ Ok(ExcelWorkbook {
208
+ sheets: vec![],
209
+ metadata: office_metadata.unwrap_or_default(),
210
+ })
211
+ }
212
+ }
129
213
  }
214
+ // Standard OpenDocument format
130
215
  ".ods" => {
216
+ let cursor = Cursor::new(data);
131
217
  let workbook = calamine::Ods::new(cursor)
132
218
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse ODS: {}", e)))?;
133
219
  process_workbook(workbook, office_metadata)
@@ -96,14 +96,31 @@ impl DocumentExtractor for PdfExtractor {
96
96
  }
97
97
  })?;
98
98
 
99
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
100
- let err_msg = crate::pdf::error::format_pdfium_error(e);
101
- if err_msg.contains("password") || err_msg.contains("Password") {
102
- PdfError::PasswordRequired
103
- } else {
104
- PdfError::InvalidPdf(err_msg)
99
+ let document = match pdfium.load_pdf_from_byte_slice(content, None) {
100
+ Ok(doc) => doc,
101
+ Err(e) => {
102
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
103
+ if err_msg.contains("password") || err_msg.contains("Password") {
104
+ #[cfg(feature = "otel")]
105
+ tracing::warn!("Password-protected PDF encountered in WASM, returning empty result");
106
+ } else {
107
+ #[cfg(feature = "otel")]
108
+ tracing::warn!("Malformed or invalid PDF encountered in WASM: {}", err_msg);
109
+ }
110
+ return Ok(ExtractionResult {
111
+ content: String::new(),
112
+ mime_type: mime_type.to_string().into(),
113
+ metadata: Metadata::default(),
114
+ tables: vec![],
115
+ detected_languages: None,
116
+ chunks: None,
117
+ images: None,
118
+ pages: None,
119
+ djot_content: None,
120
+ elements: None,
121
+ });
105
122
  }
106
- })?;
123
+ };
107
124
 
108
125
  extract_all_from_document(&document, config)?
109
126
  }
@@ -113,23 +130,27 @@ impl DocumentExtractor for PdfExtractor {
113
130
  let content_owned = content.to_vec();
114
131
  let span = tracing::Span::current();
115
132
  let config_owned = config.clone();
116
- tokio::task::spawn_blocking(move || {
133
+ let result = tokio::task::spawn_blocking(move || {
117
134
  let _guard = span.entered();
118
135
 
119
136
  let pdfium =
120
137
  crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
121
138
 
122
- let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
123
- let err_msg = crate::pdf::error::format_pdfium_error(e);
124
- if err_msg.contains("password") || err_msg.contains("Password") {
125
- PdfError::PasswordRequired
126
- } else {
127
- PdfError::InvalidPdf(err_msg)
139
+ let document = match pdfium.load_pdf_from_byte_slice(&content_owned, None) {
140
+ Ok(doc) => doc,
141
+ Err(e) => {
142
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
143
+ if err_msg.contains("password") || err_msg.contains("Password") {
144
+ return Err(PdfError::PasswordRequired);
145
+ } else {
146
+ return Err(PdfError::InvalidPdf(err_msg));
147
+ }
128
148
  }
129
- })?;
149
+ };
130
150
 
131
151
  let (pdf_metadata, native_text, tables, page_contents, _boundaries) =
132
- extract_all_from_document(&document, &config_owned)?;
152
+ extract_all_from_document(&document, &config_owned)
153
+ .map_err(|e| PdfError::ExtractionFailed(e.to_string()))?;
133
154
 
134
155
  if let Some(page_cfg) = config_owned.pages.as_ref()
135
156
  && page_cfg.extract_pages
@@ -138,11 +159,10 @@ impl DocumentExtractor for PdfExtractor {
138
159
  return Err(PdfError::ExtractionFailed(
139
160
  "Page extraction was configured but no page data was extracted in batch mode"
140
161
  .to_string(),
141
- )
142
- .into());
162
+ ));
143
163
  }
144
164
 
145
- Ok::<_, crate::error::KreuzbergError>((
165
+ Ok::<_, crate::pdf::error::PdfError>((
146
166
  pdf_metadata,
147
167
  native_text,
148
168
  tables,
@@ -151,19 +171,67 @@ impl DocumentExtractor for PdfExtractor {
151
171
  ))
152
172
  })
153
173
  .await
154
- .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
174
+ .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))?;
175
+
176
+ match result {
177
+ Ok(tuple) => tuple,
178
+ Err(e) => {
179
+ let err_msg = e.to_string();
180
+ if err_msg.contains("password")
181
+ || err_msg.contains("Password")
182
+ || err_msg.contains("password-protected")
183
+ {
184
+ #[cfg(feature = "otel")]
185
+ tracing::warn!(
186
+ "Password-protected PDF encountered in batch mode, returning empty result"
187
+ );
188
+ } else {
189
+ #[cfg(feature = "otel")]
190
+ tracing::warn!("Malformed or invalid PDF encountered in batch mode: {}", err_msg);
191
+ }
192
+ return Ok(ExtractionResult {
193
+ content: String::new(),
194
+ mime_type: mime_type.to_string().into(),
195
+ metadata: Metadata::default(),
196
+ tables: vec![],
197
+ detected_languages: None,
198
+ chunks: None,
199
+ images: None,
200
+ pages: None,
201
+ djot_content: None,
202
+ elements: None,
203
+ });
204
+ }
205
+ }
155
206
  } else {
156
207
  let pdfium =
157
208
  crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
158
209
 
159
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
160
- let err_msg = crate::pdf::error::format_pdfium_error(e);
161
- if err_msg.contains("password") || err_msg.contains("Password") {
162
- PdfError::PasswordRequired
163
- } else {
164
- PdfError::InvalidPdf(err_msg)
210
+ let document = match pdfium.load_pdf_from_byte_slice(content, None) {
211
+ Ok(doc) => doc,
212
+ Err(e) => {
213
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
214
+ if err_msg.contains("password") || err_msg.contains("Password") {
215
+ #[cfg(feature = "otel")]
216
+ tracing::warn!("Password-protected PDF encountered, returning empty result");
217
+ } else {
218
+ #[cfg(feature = "otel")]
219
+ tracing::warn!("Malformed or invalid PDF encountered: {}", err_msg);
220
+ }
221
+ return Ok(ExtractionResult {
222
+ content: String::new(),
223
+ mime_type: mime_type.to_string().into(),
224
+ metadata: Metadata::default(),
225
+ tables: vec![],
226
+ detected_languages: None,
227
+ chunks: None,
228
+ images: None,
229
+ pages: None,
230
+ djot_content: None,
231
+ elements: None,
232
+ });
165
233
  }
166
- })?;
234
+ };
167
235
 
168
236
  extract_all_from_document(&document, config)?
169
237
  }
@@ -173,14 +241,31 @@ impl DocumentExtractor for PdfExtractor {
173
241
  let pdfium =
174
242
  crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
175
243
 
176
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
177
- let err_msg = crate::pdf::error::format_pdfium_error(e);
178
- if err_msg.contains("password") || err_msg.contains("Password") {
179
- PdfError::PasswordRequired
180
- } else {
181
- PdfError::InvalidPdf(err_msg)
244
+ let document = match pdfium.load_pdf_from_byte_slice(content, None) {
245
+ Ok(doc) => doc,
246
+ Err(e) => {
247
+ let err_msg = crate::pdf::error::format_pdfium_error(e);
248
+ if err_msg.contains("password") || err_msg.contains("Password") {
249
+ #[cfg(feature = "otel")]
250
+ tracing::warn!("Password-protected PDF encountered, returning empty result");
251
+ } else {
252
+ #[cfg(feature = "otel")]
253
+ tracing::warn!("Malformed or invalid PDF encountered: {}", err_msg);
254
+ }
255
+ return Ok(ExtractionResult {
256
+ content: String::new(),
257
+ mime_type: mime_type.to_string().into(),
258
+ metadata: Metadata::default(),
259
+ tables: vec![],
260
+ detected_languages: None,
261
+ chunks: None,
262
+ images: None,
263
+ pages: None,
264
+ djot_content: None,
265
+ elements: None,
266
+ });
182
267
  }
183
- })?;
268
+ };
184
269
 
185
270
  extract_all_from_document(&document, config)?
186
271
  }
@@ -628,4 +713,52 @@ mod tests {
628
713
  let extractor = PdfExtractor::new();
629
714
  assert_eq!(extractor.name(), "pdf-extractor");
630
715
  }
716
+
717
+ #[tokio::test]
718
+ #[cfg(feature = "pdf")]
719
+ async fn test_pdf_gracefully_handles_malformed_pdf() {
720
+ let extractor = PdfExtractor::new();
721
+ let config = ExtractionConfig::default();
722
+
723
+ // Create a malformed PDF: just some random bytes that start with %PDF but are incomplete
724
+ let malformed_pdf = b"%PDF-1.4\nmalformed content that is not a valid PDF".to_vec();
725
+
726
+ let result = extractor
727
+ .extract_bytes(&malformed_pdf, "application/pdf", &config)
728
+ .await;
729
+
730
+ assert!(
731
+ result.is_ok(),
732
+ "Malformed PDF should be handled gracefully, not return an error"
733
+ );
734
+
735
+ let extraction_result = result.unwrap();
736
+ assert_eq!(extraction_result.content, "", "Malformed PDF should have empty content");
737
+ assert_eq!(extraction_result.tables.len(), 0, "Malformed PDF should have no tables");
738
+ assert_eq!(
739
+ extraction_result.mime_type.as_ref() as &str,
740
+ "application/pdf",
741
+ "MIME type should be preserved"
742
+ );
743
+ }
744
+
745
+ #[tokio::test]
746
+ #[cfg(feature = "pdf")]
747
+ async fn test_pdf_gracefully_handles_invalid_bytes() {
748
+ let extractor = PdfExtractor::new();
749
+ let config = ExtractionConfig::default();
750
+
751
+ // Create completely invalid data (not a PDF at all)
752
+ let invalid_data = b"This is not a PDF file, just random text".to_vec();
753
+
754
+ let result = extractor.extract_bytes(&invalid_data, "application/pdf", &config).await;
755
+
756
+ assert!(
757
+ result.is_ok(),
758
+ "Invalid PDF data should be handled gracefully, not return an error"
759
+ );
760
+
761
+ let extraction_result = result.unwrap();
762
+ assert_eq!(extraction_result.content, "", "Invalid PDF should have empty content");
763
+ }
631
764
  }
@@ -254,14 +254,17 @@ pub(crate) fn bind_pdfium(
254
254
  mod tests {
255
255
  use super::*;
256
256
  use crate::pdf::error::PdfError;
257
+ use serial_test::serial;
257
258
 
258
259
  #[test]
260
+ #[serial]
259
261
  fn test_bind_pdfium_lazy_initialization() {
260
262
  let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
261
263
  assert!(result.is_ok(), "First bind_pdfium call should succeed");
262
264
  }
263
265
 
264
266
  #[test]
267
+ #[serial]
265
268
  fn test_bind_pdfium_multiple_calls() {
266
269
  // First call - acquire lock, test success, then drop handle to release lock
267
270
  {
@@ -277,6 +280,7 @@ mod tests {
277
280
  }
278
281
 
279
282
  #[test]
283
+ #[serial]
280
284
  fn test_bind_pdfium_returns_same_instance() {
281
285
  // Get pointer from first handle, then drop it to release lock
282
286
  let ptr1 = {
@@ -295,6 +299,7 @@ mod tests {
295
299
  }
296
300
 
297
301
  #[test]
302
+ #[serial]
298
303
  fn test_bind_pdfium_error_mapping() {
299
304
  let map_err = |msg: String| PdfError::TextExtractionFailed(msg);
300
305
 
@@ -308,6 +313,7 @@ mod tests {
308
313
  }
309
314
 
310
315
  #[test]
316
+ #[serial]
311
317
  fn test_pdfium_handle_deref() {
312
318
  let handle = bind_pdfium(PdfError::TextExtractionFailed, "test").unwrap();
313
319
 
@@ -486,6 +486,7 @@ mod tests {
486
486
  }
487
487
 
488
488
  #[test]
489
+ #[serial_test::serial]
489
490
  fn test_extract_metadata_invalid_pdf() {
490
491
  let result = extract_metadata(b"not a pdf");
491
492
  assert!(result.is_err());
@@ -169,14 +169,17 @@ fn calculate_optimal_dpi(
169
169
  #[cfg(test)]
170
170
  mod tests {
171
171
  use super::*;
172
+ use serial_test::serial;
172
173
 
173
174
  #[test]
175
+ #[serial]
174
176
  fn test_renderer_creation() {
175
177
  let result = PdfRenderer::new();
176
178
  assert!(result.is_ok());
177
179
  }
178
180
 
179
181
  #[test]
182
+ #[serial]
180
183
  fn test_render_invalid_pdf() {
181
184
  let renderer = PdfRenderer::new().unwrap();
182
185
  let options = PageRenderOptions::default();
@@ -186,6 +189,7 @@ mod tests {
186
189
  }
187
190
 
188
191
  #[test]
192
+ #[serial]
189
193
  fn test_render_page_not_found() {
190
194
  let renderer = PdfRenderer::new().unwrap();
191
195
  let options = PageRenderOptions::default();
@@ -242,6 +246,7 @@ mod tests {
242
246
  }
243
247
 
244
248
  #[test]
249
+ #[serial]
245
250
  fn test_render_all_pages_empty_pdf() {
246
251
  let renderer = PdfRenderer::new().unwrap();
247
252
  let options = PageRenderOptions::default();
@@ -250,6 +255,7 @@ mod tests {
250
255
  }
251
256
 
252
257
  #[test]
258
+ #[serial]
253
259
  fn test_render_page_with_password_none() {
254
260
  let renderer = PdfRenderer::new().unwrap();
255
261
  let options = PageRenderOptions::default();
@@ -258,6 +264,7 @@ mod tests {
258
264
  }
259
265
 
260
266
  #[test]
267
+ #[serial]
261
268
  fn test_render_all_pages_with_password_none() {
262
269
  let renderer = PdfRenderer::new().unwrap();
263
270
  let options = PageRenderOptions::default();
@@ -266,6 +273,7 @@ mod tests {
266
273
  }
267
274
 
268
275
  #[test]
276
+ #[serial]
269
277
  fn test_render_page_to_image_function() {
270
278
  let options = PageRenderOptions::default();
271
279
  let result = render_page_to_image(b"not a pdf", 0, &options);
@@ -348,6 +356,7 @@ mod tests {
348
356
  }
349
357
 
350
358
  #[test]
359
+ #[serial]
351
360
  fn test_render_empty_bytes() {
352
361
  let renderer = PdfRenderer::new().unwrap();
353
362
  let options = PageRenderOptions::default();
@@ -441,14 +441,17 @@ fn extract_page_hierarchy(
441
441
  #[cfg(test)]
442
442
  mod tests {
443
443
  use super::*;
444
+ use serial_test::serial;
444
445
 
445
446
  #[test]
447
+ #[serial]
446
448
  fn test_extractor_creation() {
447
449
  let result = PdfTextExtractor::new();
448
450
  assert!(result.is_ok());
449
451
  }
450
452
 
451
453
  #[test]
454
+ #[serial]
452
455
  fn test_extract_empty_pdf() {
453
456
  let extractor = PdfTextExtractor::new().unwrap();
454
457
  let result = extractor.extract_text(b"");
@@ -456,6 +459,7 @@ mod tests {
456
459
  }
457
460
 
458
461
  #[test]
462
+ #[serial]
459
463
  fn test_extract_invalid_pdf() {
460
464
  let extractor = PdfTextExtractor::new().unwrap();
461
465
  let result = extractor.extract_text(b"not a pdf");
@@ -464,6 +468,7 @@ mod tests {
464
468
  }
465
469
 
466
470
  #[test]
471
+ #[serial]
467
472
  fn test_password_required_detection() {
468
473
  let extractor = PdfTextExtractor::new().unwrap();
469
474
  let encrypted_pdf = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n";
@@ -475,6 +480,7 @@ mod tests {
475
480
  }
476
481
 
477
482
  #[test]
483
+ #[serial]
478
484
  fn test_extract_text_with_passwords_empty_list() {
479
485
  let extractor = PdfTextExtractor::new().unwrap();
480
486
  let result = extractor.extract_text_with_passwords(b"not a pdf", &[]);
@@ -485,6 +491,7 @@ mod tests {
485
491
  #[cfg(test)]
486
492
  mod cache_regression_tests {
487
493
  use super::*;
494
+ use serial_test::serial;
488
495
  use std::time::Instant;
489
496
 
490
497
  /// Test that multiple extractions of the same document produce consistent results.
@@ -497,6 +504,7 @@ mod cache_regression_tests {
497
504
  /// 1. Multiple extractions produce identical text content
498
505
  /// 2. The singleton pattern provides consistent extraction behavior
499
506
  #[test]
507
+ #[serial]
500
508
  fn test_no_global_cache_between_documents() {
501
509
  let pdf_bytes = std::fs::read("../../test_documents/pdf/fake_memo.pdf").expect("Failed to read PDF");
502
510
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.13"
3
+ version = "4.2.14"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.13
4
+ version: 4.2.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld