kreuzberg 4.2.12 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -7
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  19. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  20. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  21. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  22. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  23. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  24. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  25. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  26. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  27. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  28. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  29. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  30. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  31. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. metadata +4 -2
@@ -5,14 +5,17 @@
5
5
  //! - ZIP archives
6
6
  //! - TAR archives (including compressed TAR.GZ, TAR.BZ2)
7
7
  //! - 7Z archives
8
+ //! - GZIP archives
8
9
  //!
9
10
  //! Each format has its own submodule with specialized extraction logic.
10
11
 
12
+ mod gzip;
11
13
  mod sevenz;
12
14
  mod tar;
13
15
  mod zip;
14
16
 
15
17
  // Re-export all public functions for backward compatibility
18
+ pub use gzip::{decompress_gzip, extract_gzip, extract_gzip_metadata, extract_gzip_text_content};
16
19
  pub use sevenz::{extract_7z_metadata, extract_7z_text_content};
17
20
  pub use tar::{extract_tar_metadata, extract_tar_text_content};
18
21
  pub use zip::{extract_zip_metadata, extract_zip_text_content};
@@ -49,10 +52,15 @@ pub(crate) const TEXT_EXTENSIONS: &[&str] = &[
49
52
  #[cfg(test)]
50
53
  mod tests {
51
54
  use super::*;
55
+ use crate::extractors::security::SecurityLimits;
52
56
  use ::tar::Builder as TarBuilder;
53
57
  use ::zip::write::{FileOptions, ZipWriter};
54
58
  use std::io::{Cursor, Write};
55
59
 
60
+ fn default_limits() -> SecurityLimits {
61
+ SecurityLimits::default()
62
+ }
63
+
56
64
  #[test]
57
65
  fn test_extract_zip_metadata() {
58
66
  let mut cursor = Cursor::new(Vec::new());
@@ -70,7 +78,7 @@ mod tests {
70
78
  }
71
79
 
72
80
  let bytes = cursor.into_inner();
73
- let metadata = extract_zip_metadata(&bytes).unwrap();
81
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
74
82
 
75
83
  assert_eq!(metadata.format, "ZIP");
76
84
  assert_eq!(metadata.file_count, 2);
@@ -102,7 +110,7 @@ mod tests {
102
110
  }
103
111
 
104
112
  let bytes = cursor.into_inner();
105
- let metadata = extract_tar_metadata(&bytes).unwrap();
113
+ let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
106
114
 
107
115
  assert_eq!(metadata.format, "TAR");
108
116
  assert_eq!(metadata.file_count, 2);
@@ -127,7 +135,7 @@ mod tests {
127
135
  }
128
136
 
129
137
  let bytes = cursor.into_inner();
130
- let contents = extract_zip_text_content(&bytes).unwrap();
138
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
131
139
 
132
140
  assert_eq!(contents.len(), 2);
133
141
  assert_eq!(contents.get("test.txt").unwrap(), "Hello, World!");
@@ -158,7 +166,7 @@ mod tests {
158
166
  }
159
167
 
160
168
  let bytes = cursor.into_inner();
161
- let contents = extract_tar_text_content(&bytes).unwrap();
169
+ let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
162
170
 
163
171
  assert_eq!(contents.len(), 2);
164
172
  assert_eq!(contents.get("test.txt").unwrap(), "Hello, World!");
@@ -168,14 +176,14 @@ mod tests {
168
176
  #[test]
169
177
  fn test_extract_zip_metadata_invalid() {
170
178
  let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
171
- let result = extract_zip_metadata(&invalid_bytes);
179
+ let result = extract_zip_metadata(&invalid_bytes, &default_limits());
172
180
  assert!(result.is_err());
173
181
  }
174
182
 
175
183
  #[test]
176
184
  fn test_extract_tar_metadata_invalid() {
177
185
  let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
178
- let result = extract_tar_metadata(&invalid_bytes);
186
+ let result = extract_tar_metadata(&invalid_bytes, &default_limits());
179
187
  assert!(result.is_err());
180
188
  }
181
189
 
@@ -199,7 +207,7 @@ mod tests {
199
207
  }
200
208
 
201
209
  let bytes = cursor.into_inner();
202
- let metadata = extract_zip_metadata(&bytes).unwrap();
210
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
203
211
 
204
212
  assert_eq!(metadata.format, "ZIP");
205
213
  assert_eq!(metadata.file_count, 4);
@@ -233,7 +241,7 @@ mod tests {
233
241
  }
234
242
 
235
243
  let bytes = cursor.into_inner();
236
- let metadata = extract_tar_metadata(&bytes).unwrap();
244
+ let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
237
245
 
238
246
  assert_eq!(metadata.format, "TAR");
239
247
  assert_eq!(metadata.file_count, 2);
@@ -258,7 +266,7 @@ mod tests {
258
266
  tar.finish().unwrap();
259
267
  }
260
268
 
261
- let metadata = extract_tar_metadata(&tar_data).unwrap();
269
+ let metadata = extract_tar_metadata(&tar_data, &default_limits()).unwrap();
262
270
  assert_eq!(metadata.format, "TAR");
263
271
  assert_eq!(metadata.file_count, 1);
264
272
  assert_eq!(metadata.file_list[0].path, "test.txt");
@@ -288,7 +296,7 @@ mod tests {
288
296
  };
289
297
 
290
298
  let bytes = cursor.into_inner();
291
- let metadata = extract_7z_metadata(&bytes).unwrap();
299
+ let metadata = extract_7z_metadata(&bytes, &default_limits()).unwrap();
292
300
 
293
301
  assert_eq!(metadata.format, "7Z");
294
302
  assert_eq!(metadata.file_count, 2);
@@ -324,7 +332,7 @@ mod tests {
324
332
  }
325
333
 
326
334
  let outer_bytes = outer_cursor.into_inner();
327
- let metadata = extract_zip_metadata(&outer_bytes).unwrap();
335
+ let metadata = extract_zip_metadata(&outer_bytes, &default_limits()).unwrap();
328
336
 
329
337
  assert_eq!(metadata.file_count, 2);
330
338
 
@@ -371,7 +379,7 @@ mod tests {
371
379
  }
372
380
 
373
381
  let outer_bytes = outer_cursor.into_inner();
374
- let metadata = extract_tar_metadata(&outer_bytes).unwrap();
382
+ let metadata = extract_tar_metadata(&outer_bytes, &default_limits()).unwrap();
375
383
 
376
384
  assert_eq!(metadata.file_count, 2);
377
385
 
@@ -397,7 +405,7 @@ mod tests {
397
405
  let mut corrupted = valid_cursor.into_inner();
398
406
  corrupted.truncate(corrupted.len() / 2);
399
407
 
400
- let result = extract_zip_metadata(&corrupted);
408
+ let result = extract_zip_metadata(&corrupted, &default_limits());
401
409
  assert!(result.is_err());
402
410
 
403
411
  if let Err(e) = result {
@@ -424,7 +432,7 @@ mod tests {
424
432
  let mut corrupted = valid_cursor.into_inner();
425
433
  corrupted[100] = 0xFF;
426
434
 
427
- let result = extract_tar_metadata(&corrupted);
435
+ let result = extract_tar_metadata(&corrupted, &default_limits());
428
436
  assert!(result.is_err());
429
437
  }
430
438
 
@@ -437,7 +445,7 @@ mod tests {
437
445
  }
438
446
 
439
447
  let bytes = cursor.into_inner();
440
- let metadata = extract_zip_metadata(&bytes).unwrap();
448
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
441
449
 
442
450
  assert_eq!(metadata.format, "ZIP");
443
451
  assert_eq!(metadata.file_count, 0);
@@ -454,7 +462,7 @@ mod tests {
454
462
  }
455
463
 
456
464
  let bytes = cursor.into_inner();
457
- let metadata = extract_tar_metadata(&bytes).unwrap();
465
+ let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
458
466
 
459
467
  assert_eq!(metadata.format, "TAR");
460
468
  assert_eq!(metadata.file_count, 0);
@@ -485,7 +493,7 @@ mod tests {
485
493
  }
486
494
 
487
495
  let bytes = cursor.into_inner();
488
- let contents = extract_zip_text_content(&bytes).unwrap();
496
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
489
497
 
490
498
  assert_eq!(contents.len(), 3);
491
499
  assert_eq!(contents.get("file1.txt").unwrap(), "Content 1");
@@ -519,7 +527,7 @@ mod tests {
519
527
  }
520
528
 
521
529
  let bytes = cursor.into_inner();
522
- let contents = extract_tar_text_content(&bytes).unwrap();
530
+ let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
523
531
 
524
532
  assert_eq!(contents.len(), 4);
525
533
  assert_eq!(contents.get("file1.txt").unwrap(), "Content 1");
@@ -552,7 +560,7 @@ mod tests {
552
560
  }
553
561
 
554
562
  let bytes = cursor.into_inner();
555
- let metadata = extract_zip_metadata(&bytes).unwrap();
563
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
556
564
 
557
565
  let paths: Vec<&str> = metadata.file_list.iter().map(|e| e.path.as_str()).collect();
558
566
  assert!(paths.contains(&"root/"));
@@ -579,12 +587,12 @@ mod tests {
579
587
  }
580
588
 
581
589
  let bytes = cursor.into_inner();
582
- let metadata = extract_zip_metadata(&bytes).unwrap();
590
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
583
591
 
584
592
  assert_eq!(metadata.file_count, 1);
585
593
  assert_eq!(metadata.total_size, 10_000);
586
594
 
587
- let contents = extract_zip_text_content(&bytes).unwrap();
595
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
588
596
  assert_eq!(contents.get("large.txt").unwrap().len(), 10_000);
589
597
  }
590
598
 
@@ -607,12 +615,12 @@ mod tests {
607
615
  }
608
616
 
609
617
  let bytes = cursor.into_inner();
610
- let metadata = extract_zip_metadata(&bytes).unwrap();
618
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
611
619
 
612
620
  assert_eq!(metadata.file_count, 100);
613
621
  assert_eq!(metadata.file_list.len(), 100);
614
622
 
615
- let contents = extract_zip_text_content(&bytes).unwrap();
623
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
616
624
  assert_eq!(contents.len(), 100);
617
625
  }
618
626
 
@@ -632,12 +640,12 @@ mod tests {
632
640
  }
633
641
 
634
642
  let bytes = cursor.into_inner();
635
- let metadata = extract_zip_metadata(&bytes).unwrap();
643
+ let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
636
644
 
637
645
  assert_eq!(metadata.file_count, 1);
638
646
  assert!(metadata.file_list[0].path.len() > 200);
639
647
 
640
- let contents = extract_zip_text_content(&bytes).unwrap();
648
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
641
649
  assert_eq!(contents.len(), 1);
642
650
  }
643
651
 
@@ -665,7 +673,7 @@ mod tests {
665
673
  };
666
674
 
667
675
  let bytes = cursor.into_inner();
668
- let contents = extract_7z_text_content(&bytes).unwrap();
676
+ let contents = extract_7z_text_content(&bytes, &default_limits()).unwrap();
669
677
 
670
678
  assert_eq!(contents.len(), 2);
671
679
  assert_eq!(contents.get("test.txt").unwrap(), "Hello 7z text!");
@@ -683,7 +691,7 @@ mod tests {
683
691
  };
684
692
 
685
693
  let bytes = cursor.into_inner();
686
- let metadata = extract_7z_metadata(&bytes).unwrap();
694
+ let metadata = extract_7z_metadata(&bytes, &default_limits()).unwrap();
687
695
 
688
696
  assert_eq!(metadata.format, "7Z");
689
697
  assert_eq!(metadata.file_count, 0);
@@ -708,12 +716,12 @@ mod tests {
708
716
  }
709
717
 
710
718
  let bytes = cursor.into_inner();
711
- let metadata = extract_tar_metadata(&bytes).unwrap();
719
+ let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
712
720
 
713
721
  assert_eq!(metadata.file_count, 1);
714
722
  assert_eq!(metadata.total_size, 50_000);
715
723
 
716
- let contents = extract_tar_text_content(&bytes).unwrap();
724
+ let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
717
725
  assert_eq!(contents.get("large.txt").unwrap().len(), 50_000);
718
726
  }
719
727
 
@@ -740,7 +748,7 @@ mod tests {
740
748
  }
741
749
 
742
750
  let bytes = cursor.into_inner();
743
- let contents = extract_zip_text_content(&bytes).unwrap();
751
+ let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
744
752
 
745
753
  assert_eq!(contents.len(), 2);
746
754
  assert!(contents.contains_key("document.txt"));
@@ -755,11 +763,119 @@ mod tests {
755
763
 
756
764
  let invalid_7z_data = vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C, 0x00];
757
765
 
758
- let result = extract_7z_metadata(&invalid_7z_data);
766
+ let result = extract_7z_metadata(&invalid_7z_data, &default_limits());
759
767
  assert!(result.is_err());
760
768
 
761
769
  if let Err(e) = result {
762
770
  assert!(matches!(e, KreuzbergError::Parsing { .. }));
763
771
  }
764
772
  }
773
+
774
+ #[test]
775
+ fn test_extract_gzip_metadata() {
776
+ use flate2::Compression;
777
+ use flate2::write::GzEncoder;
778
+ use std::io::Write;
779
+
780
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
781
+ encoder.write_all(b"Hello from gzip!").unwrap();
782
+ let compressed = encoder.finish().unwrap();
783
+
784
+ let metadata = extract_gzip_metadata(&compressed, &default_limits()).unwrap();
785
+ assert_eq!(metadata.format, "GZIP");
786
+ assert_eq!(metadata.file_count, 1);
787
+ assert_eq!(metadata.total_size, 16);
788
+ }
789
+
790
+ #[test]
791
+ fn test_extract_gzip_text_content() {
792
+ use flate2::Compression;
793
+ use flate2::write::GzEncoder;
794
+ use std::io::Write;
795
+
796
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
797
+ encoder.write_all(b"Hello from gzip!").unwrap();
798
+ let compressed = encoder.finish().unwrap();
799
+
800
+ let contents = extract_gzip_text_content(&compressed, &default_limits()).unwrap();
801
+ assert_eq!(contents.len(), 1);
802
+ assert!(contents.values().next().unwrap().contains("Hello from gzip!"));
803
+ }
804
+
805
+ #[test]
806
+ fn test_decompress_gzip() {
807
+ use flate2::Compression;
808
+ use flate2::write::GzEncoder;
809
+ use std::io::Write;
810
+
811
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
812
+ encoder.write_all(b"test content").unwrap();
813
+ let compressed = encoder.finish().unwrap();
814
+
815
+ let decompressed = decompress_gzip(&compressed, &default_limits()).unwrap();
816
+ assert_eq!(String::from_utf8(decompressed).unwrap(), "test content");
817
+ }
818
+
819
+ #[test]
820
+ fn test_extract_gzip_invalid_data() {
821
+ let invalid = vec![0, 1, 2, 3, 4, 5];
822
+ let result = extract_gzip_metadata(&invalid, &default_limits());
823
+ assert!(result.is_err());
824
+ }
825
+
826
+ #[test]
827
+ fn test_extract_gzip_empty_content() {
828
+ use flate2::Compression;
829
+ use flate2::write::GzEncoder;
830
+
831
+ let encoder = GzEncoder::new(Vec::new(), Compression::default());
832
+ let compressed = encoder.finish().unwrap();
833
+
834
+ let metadata = extract_gzip_metadata(&compressed, &default_limits()).unwrap();
835
+ assert_eq!(metadata.format, "GZIP");
836
+ assert_eq!(metadata.total_size, 0);
837
+ }
838
+
839
+ #[test]
840
+ fn test_zip_too_many_files_rejected() {
841
+ let mut cursor = Cursor::new(Vec::new());
842
+ {
843
+ let mut zip = ZipWriter::new(&mut cursor);
844
+ let options = FileOptions::<'_, ()>::default();
845
+
846
+ for i in 0..5 {
847
+ let filename = format!("file_{}.txt", i);
848
+ zip.start_file(&filename, options).unwrap();
849
+ zip.write_all(b"content").unwrap();
850
+ }
851
+ zip.finish().unwrap();
852
+ }
853
+
854
+ let bytes = cursor.into_inner();
855
+ let limits = SecurityLimits {
856
+ max_files_in_archive: 3,
857
+ ..SecurityLimits::default()
858
+ };
859
+ let result = extract_zip_metadata(&bytes, &limits);
860
+ assert!(result.is_err());
861
+ }
862
+
863
+ #[test]
864
+ fn test_gzip_bomb_rejected() {
865
+ use flate2::Compression;
866
+ use flate2::write::GzEncoder;
867
+ use std::io::Write;
868
+
869
+ // Compress a 1024-byte payload, well above the 100-byte max_archive_size set below
870
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
871
+ encoder.write_all(&[b'A'; 1024]).unwrap();
872
+ let compressed = encoder.finish().unwrap();
873
+
874
+ let limits = SecurityLimits {
875
+ max_archive_size: 100, // 100 bytes limit
876
+ ..SecurityLimits::default()
877
+ };
878
+ let result = extract_gzip_metadata(&compressed, &limits);
879
+ assert!(result.is_err());
880
+ }
765
881
  }
@@ -4,6 +4,7 @@
4
4
 
5
5
  use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
6
6
  use crate::error::{KreuzbergError, Result};
7
+ use crate::extractors::security::SecurityLimits;
7
8
  use sevenz_rust2::{ArchiveReader, Password};
8
9
  use std::collections::HashMap;
9
10
  use std::io::Cursor;
@@ -13,6 +14,7 @@ use std::io::Cursor;
13
14
  /// # Arguments
14
15
  ///
15
16
  /// * `bytes` - The 7z archive bytes
17
+ /// * `limits` - Security limits for archive extraction
16
18
  ///
17
19
  /// # Returns
18
20
  ///
@@ -24,8 +26,9 @@ use std::io::Cursor;
24
26
  ///
25
27
  /// # Errors
26
28
  ///
27
- /// Returns an error if the 7z archive cannot be read or parsed.
28
- pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
29
+ /// Returns an error if the 7z archive cannot be read or parsed,
30
+ /// or if security limits are exceeded.
31
+ pub fn extract_7z_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
29
32
  let cursor = Cursor::new(bytes);
30
33
  let archive = ArchiveReader::new(cursor, Password::empty())
31
34
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
@@ -33,7 +36,16 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
33
36
  let mut file_list = Vec::new();
34
37
  let mut total_size = 0u64;
35
38
 
36
- for entry in &archive.archive().files {
39
+ let files = &archive.archive().files;
40
+ if files.len() > limits.max_files_in_archive {
41
+ return Err(KreuzbergError::validation(format!(
42
+ "7z archive has too many files: {} (max: {})",
43
+ files.len(),
44
+ limits.max_files_in_archive
45
+ )));
46
+ }
47
+
48
+ for entry in files {
37
49
  let path = entry.name().to_string();
38
50
  let size = entry.size();
39
51
  let is_dir = entry.is_directory();
@@ -42,6 +54,13 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
42
54
  total_size += size;
43
55
  }
44
56
 
57
+ if total_size > limits.max_archive_size as u64 {
58
+ return Err(KreuzbergError::validation(format!(
59
+ "7z archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
60
+ total_size, limits.max_archive_size
61
+ )));
62
+ }
63
+
45
64
  file_list.push(ArchiveEntry { path, size, is_dir });
46
65
  }
47
66
 
@@ -71,12 +90,22 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
71
90
  /// # Errors
72
91
  ///
73
92
  /// Returns an error if the 7z archive cannot be read or parsed, or if the file-count or text-content-size security limits are exceeded.
74
- pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
93
+ pub fn extract_7z_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
75
94
  let cursor = Cursor::new(bytes);
76
95
  let mut archive = ArchiveReader::new(cursor, Password::empty())
77
96
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
78
97
 
98
+ let file_count = archive.archive().files.len();
99
+ if file_count > limits.max_files_in_archive {
100
+ return Err(KreuzbergError::validation(format!(
101
+ "7z archive has too many files: {} (max: {})",
102
+ file_count, limits.max_files_in_archive
103
+ )));
104
+ }
105
+
79
106
  let mut contents = HashMap::new();
107
+ let max_content_size = limits.max_content_size;
108
+ let mut total_content_size = 0usize;
80
109
 
81
110
  archive
82
111
  .for_each_entries(|entry, reader| {
@@ -87,6 +116,10 @@ pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
87
116
  if let Ok(_) = reader.read_to_end(&mut content)
88
117
  && let Ok(text) = String::from_utf8(content)
89
118
  {
119
+ total_content_size = total_content_size.saturating_add(text.len());
120
+ if total_content_size > max_content_size {
121
+ return Ok(false);
122
+ }
90
123
  contents.insert(path, text);
91
124
  }
92
125
  }
@@ -94,5 +127,12 @@ pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
94
127
  })
95
128
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z entries: {}", e)))?;
96
129
 
130
+ if total_content_size > max_content_size {
131
+ return Err(KreuzbergError::validation(format!(
132
+ "7z archive text content exceeds limit: {} bytes (max: {} bytes)",
133
+ total_content_size, max_content_size
134
+ )));
135
+ }
136
+
97
137
  Ok(contents)
98
138
  }
@@ -5,6 +5,7 @@
5
5
 
6
6
  use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
7
7
  use crate::error::{KreuzbergError, Result};
8
+ use crate::extractors::security::SecurityLimits;
8
9
  use std::collections::HashMap;
9
10
  use std::io::{Cursor, Read};
10
11
  use tar::Archive as TarArchive;
@@ -14,6 +15,7 @@ use tar::Archive as TarArchive;
14
15
  /// # Arguments
15
16
  ///
16
17
  /// * `bytes` - The TAR archive bytes (can be compressed with gzip or bzip2)
18
+ /// * `limits` - Security limits for archive extraction
17
19
  ///
18
20
  /// # Returns
19
21
  ///
@@ -25,8 +27,9 @@ use tar::Archive as TarArchive;
25
27
  ///
26
28
  /// # Errors
27
29
  ///
28
- /// Returns an error if the TAR archive cannot be read or parsed.
29
- pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
30
+ /// Returns an error if the TAR archive cannot be read or parsed,
31
+ /// or if security limits are exceeded.
32
+ pub fn extract_tar_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
30
33
  let cursor = Cursor::new(bytes);
31
34
  let mut archive = TarArchive::new(cursor);
32
35
 
@@ -56,6 +59,21 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
56
59
  }
57
60
 
58
61
  file_count += 1;
62
+
63
+ if file_count > limits.max_files_in_archive {
64
+ return Err(KreuzbergError::validation(format!(
65
+ "TAR archive has too many files: {} (max: {})",
66
+ file_count, limits.max_files_in_archive
67
+ )));
68
+ }
69
+
70
+ if total_size > limits.max_archive_size as u64 {
71
+ return Err(KreuzbergError::validation(format!(
72
+ "TAR archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
73
+ total_size, limits.max_archive_size
74
+ )));
75
+ }
76
+
59
77
  file_list.push(ArchiveEntry { path, size, is_dir });
60
78
  }
61
79
 
@@ -83,12 +101,14 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
83
101
  /// # Errors
84
102
  ///
85
103
  /// Returns an error if the TAR archive cannot be read or parsed, or if the file-count or text-content-size security limits are exceeded.
86
- pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
104
+ pub fn extract_tar_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
87
105
  let cursor = Cursor::new(bytes);
88
106
  let mut archive = TarArchive::new(cursor);
89
107
 
90
108
  let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
91
109
  let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
110
+ let mut file_count = 0usize;
111
+ let mut total_content_size = 0usize;
92
112
 
93
113
  let entries = archive
94
114
  .entries()
@@ -98,6 +118,14 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
98
118
  let mut entry =
99
119
  entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
100
120
 
121
+ file_count += 1;
122
+ if file_count > limits.max_files_in_archive {
123
+ return Err(KreuzbergError::validation(format!(
124
+ "TAR archive has too many files: {} (max: {})",
125
+ file_count, limits.max_files_in_archive
126
+ )));
127
+ }
128
+
101
129
  let path = entry
102
130
  .path()
103
131
  .map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
@@ -109,6 +137,13 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
109
137
  let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
110
138
  let mut content = String::with_capacity(estimated_size);
111
139
  if entry.read_to_string(&mut content).is_ok() {
140
+ total_content_size = total_content_size.saturating_add(content.len());
141
+ if total_content_size > limits.max_content_size {
142
+ return Err(KreuzbergError::validation(format!(
143
+ "TAR archive text content exceeds limit: {} bytes (max: {} bytes)",
144
+ total_content_size, limits.max_content_size
145
+ )));
146
+ }
112
147
  contents.insert(path, content);
113
148
  }
114
149
  }
@@ -4,6 +4,7 @@
4
4
 
5
5
  use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
6
6
  use crate::error::{KreuzbergError, Result};
7
+ use crate::extractors::security::SecurityLimits;
7
8
  use std::collections::HashMap;
8
9
  use std::io::{Cursor, Read};
9
10
  use zip::ZipArchive;
@@ -13,6 +14,7 @@ use zip::ZipArchive;
13
14
  /// # Arguments
14
15
  ///
15
16
  /// * `bytes` - The ZIP archive bytes
17
+ /// * `limits` - Security limits for archive extraction
16
18
  ///
17
19
  /// # Returns
18
20
  ///
@@ -24,12 +26,21 @@ use zip::ZipArchive;
24
26
  ///
25
27
  /// # Errors
26
28
  ///
27
- /// Returns an error if the ZIP archive cannot be read or parsed.
28
- pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
29
+ /// Returns an error if the ZIP archive cannot be read or parsed,
30
+ /// or if security limits are exceeded.
31
+ pub fn extract_zip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
29
32
  let cursor = Cursor::new(bytes);
30
33
  let mut archive =
31
34
  ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
32
35
 
36
+ if archive.len() > limits.max_files_in_archive {
37
+ return Err(KreuzbergError::validation(format!(
38
+ "ZIP archive has too many files: {} (max: {})",
39
+ archive.len(),
40
+ limits.max_files_in_archive
41
+ )));
42
+ }
43
+
33
44
  let mut file_list = Vec::with_capacity(archive.len());
34
45
  let mut total_size = 0u64;
35
46
 
@@ -46,6 +57,13 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
46
57
  total_size += size;
47
58
  }
48
59
 
60
+ if total_size > limits.max_archive_size as u64 {
61
+ return Err(KreuzbergError::validation(format!(
62
+ "ZIP archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
63
+ total_size, limits.max_archive_size
64
+ )));
65
+ }
66
+
49
67
  file_list.push(ArchiveEntry { path, size, is_dir });
50
68
  }
51
69
 
@@ -73,13 +91,22 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
73
91
  /// # Errors
74
92
  ///
75
93
  /// Returns an error if the ZIP archive cannot be read or parsed, or if the file-count or text-content-size security limits are exceeded.
76
- pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
94
+ pub fn extract_zip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
77
95
  let cursor = Cursor::new(bytes);
78
96
  let mut archive =
79
97
  ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
80
98
 
99
+ if archive.len() > limits.max_files_in_archive {
100
+ return Err(KreuzbergError::validation(format!(
101
+ "ZIP archive has too many files: {} (max: {})",
102
+ archive.len(),
103
+ limits.max_files_in_archive
104
+ )));
105
+ }
106
+
81
107
  let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
82
108
  let mut contents = HashMap::with_capacity(estimated_text_files);
109
+ let mut total_content_size = 0usize;
83
110
 
84
111
  for i in 0..archive.len() {
85
112
  let mut file = archive
@@ -92,6 +119,13 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
92
119
  let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
93
120
  let mut content = String::with_capacity(estimated_size);
94
121
  if file.read_to_string(&mut content).is_ok() {
122
+ total_content_size = total_content_size.saturating_add(content.len());
123
+ if total_content_size > limits.max_content_size {
124
+ return Err(KreuzbergError::validation(format!(
125
+ "ZIP archive text content exceeds limit: {} bytes (max: {} bytes)",
126
+ total_content_size, limits.max_content_size
127
+ )));
128
+ }
95
129
  contents.insert(path, content);
96
130
  }
97
131
  }