kreuzberg 4.2.12 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -7
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
|
@@ -5,14 +5,17 @@
|
|
|
5
5
|
//! - ZIP archives
|
|
6
6
|
//! - TAR archives (including compressed TAR.GZ, TAR.BZ2)
|
|
7
7
|
//! - 7Z archives
|
|
8
|
+
//! - GZIP archives
|
|
8
9
|
//!
|
|
9
10
|
//! Each format has its own submodule with specialized extraction logic.
|
|
10
11
|
|
|
12
|
+
mod gzip;
|
|
11
13
|
mod sevenz;
|
|
12
14
|
mod tar;
|
|
13
15
|
mod zip;
|
|
14
16
|
|
|
15
17
|
// Re-export all public functions for backward compatibility
|
|
18
|
+
pub use gzip::{decompress_gzip, extract_gzip, extract_gzip_metadata, extract_gzip_text_content};
|
|
16
19
|
pub use sevenz::{extract_7z_metadata, extract_7z_text_content};
|
|
17
20
|
pub use tar::{extract_tar_metadata, extract_tar_text_content};
|
|
18
21
|
pub use zip::{extract_zip_metadata, extract_zip_text_content};
|
|
@@ -49,10 +52,15 @@ pub(crate) const TEXT_EXTENSIONS: &[&str] = &[
|
|
|
49
52
|
#[cfg(test)]
|
|
50
53
|
mod tests {
|
|
51
54
|
use super::*;
|
|
55
|
+
use crate::extractors::security::SecurityLimits;
|
|
52
56
|
use ::tar::Builder as TarBuilder;
|
|
53
57
|
use ::zip::write::{FileOptions, ZipWriter};
|
|
54
58
|
use std::io::{Cursor, Write};
|
|
55
59
|
|
|
60
|
+
fn default_limits() -> SecurityLimits {
|
|
61
|
+
SecurityLimits::default()
|
|
62
|
+
}
|
|
63
|
+
|
|
56
64
|
#[test]
|
|
57
65
|
fn test_extract_zip_metadata() {
|
|
58
66
|
let mut cursor = Cursor::new(Vec::new());
|
|
@@ -70,7 +78,7 @@ mod tests {
|
|
|
70
78
|
}
|
|
71
79
|
|
|
72
80
|
let bytes = cursor.into_inner();
|
|
73
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
81
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
74
82
|
|
|
75
83
|
assert_eq!(metadata.format, "ZIP");
|
|
76
84
|
assert_eq!(metadata.file_count, 2);
|
|
@@ -102,7 +110,7 @@ mod tests {
|
|
|
102
110
|
}
|
|
103
111
|
|
|
104
112
|
let bytes = cursor.into_inner();
|
|
105
|
-
let metadata = extract_tar_metadata(&bytes).unwrap();
|
|
113
|
+
let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
|
|
106
114
|
|
|
107
115
|
assert_eq!(metadata.format, "TAR");
|
|
108
116
|
assert_eq!(metadata.file_count, 2);
|
|
@@ -127,7 +135,7 @@ mod tests {
|
|
|
127
135
|
}
|
|
128
136
|
|
|
129
137
|
let bytes = cursor.into_inner();
|
|
130
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
138
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
131
139
|
|
|
132
140
|
assert_eq!(contents.len(), 2);
|
|
133
141
|
assert_eq!(contents.get("test.txt").unwrap(), "Hello, World!");
|
|
@@ -158,7 +166,7 @@ mod tests {
|
|
|
158
166
|
}
|
|
159
167
|
|
|
160
168
|
let bytes = cursor.into_inner();
|
|
161
|
-
let contents = extract_tar_text_content(&bytes).unwrap();
|
|
169
|
+
let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
|
|
162
170
|
|
|
163
171
|
assert_eq!(contents.len(), 2);
|
|
164
172
|
assert_eq!(contents.get("test.txt").unwrap(), "Hello, World!");
|
|
@@ -168,14 +176,14 @@ mod tests {
|
|
|
168
176
|
#[test]
|
|
169
177
|
fn test_extract_zip_metadata_invalid() {
|
|
170
178
|
let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
|
|
171
|
-
let result = extract_zip_metadata(&invalid_bytes);
|
|
179
|
+
let result = extract_zip_metadata(&invalid_bytes, &default_limits());
|
|
172
180
|
assert!(result.is_err());
|
|
173
181
|
}
|
|
174
182
|
|
|
175
183
|
#[test]
|
|
176
184
|
fn test_extract_tar_metadata_invalid() {
|
|
177
185
|
let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
|
|
178
|
-
let result = extract_tar_metadata(&invalid_bytes);
|
|
186
|
+
let result = extract_tar_metadata(&invalid_bytes, &default_limits());
|
|
179
187
|
assert!(result.is_err());
|
|
180
188
|
}
|
|
181
189
|
|
|
@@ -199,7 +207,7 @@ mod tests {
|
|
|
199
207
|
}
|
|
200
208
|
|
|
201
209
|
let bytes = cursor.into_inner();
|
|
202
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
210
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
203
211
|
|
|
204
212
|
assert_eq!(metadata.format, "ZIP");
|
|
205
213
|
assert_eq!(metadata.file_count, 4);
|
|
@@ -233,7 +241,7 @@ mod tests {
|
|
|
233
241
|
}
|
|
234
242
|
|
|
235
243
|
let bytes = cursor.into_inner();
|
|
236
|
-
let metadata = extract_tar_metadata(&bytes).unwrap();
|
|
244
|
+
let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
|
|
237
245
|
|
|
238
246
|
assert_eq!(metadata.format, "TAR");
|
|
239
247
|
assert_eq!(metadata.file_count, 2);
|
|
@@ -258,7 +266,7 @@ mod tests {
|
|
|
258
266
|
tar.finish().unwrap();
|
|
259
267
|
}
|
|
260
268
|
|
|
261
|
-
let metadata = extract_tar_metadata(&tar_data).unwrap();
|
|
269
|
+
let metadata = extract_tar_metadata(&tar_data, &default_limits()).unwrap();
|
|
262
270
|
assert_eq!(metadata.format, "TAR");
|
|
263
271
|
assert_eq!(metadata.file_count, 1);
|
|
264
272
|
assert_eq!(metadata.file_list[0].path, "test.txt");
|
|
@@ -288,7 +296,7 @@ mod tests {
|
|
|
288
296
|
};
|
|
289
297
|
|
|
290
298
|
let bytes = cursor.into_inner();
|
|
291
|
-
let metadata = extract_7z_metadata(&bytes).unwrap();
|
|
299
|
+
let metadata = extract_7z_metadata(&bytes, &default_limits()).unwrap();
|
|
292
300
|
|
|
293
301
|
assert_eq!(metadata.format, "7Z");
|
|
294
302
|
assert_eq!(metadata.file_count, 2);
|
|
@@ -324,7 +332,7 @@ mod tests {
|
|
|
324
332
|
}
|
|
325
333
|
|
|
326
334
|
let outer_bytes = outer_cursor.into_inner();
|
|
327
|
-
let metadata = extract_zip_metadata(&outer_bytes).unwrap();
|
|
335
|
+
let metadata = extract_zip_metadata(&outer_bytes, &default_limits()).unwrap();
|
|
328
336
|
|
|
329
337
|
assert_eq!(metadata.file_count, 2);
|
|
330
338
|
|
|
@@ -371,7 +379,7 @@ mod tests {
|
|
|
371
379
|
}
|
|
372
380
|
|
|
373
381
|
let outer_bytes = outer_cursor.into_inner();
|
|
374
|
-
let metadata = extract_tar_metadata(&outer_bytes).unwrap();
|
|
382
|
+
let metadata = extract_tar_metadata(&outer_bytes, &default_limits()).unwrap();
|
|
375
383
|
|
|
376
384
|
assert_eq!(metadata.file_count, 2);
|
|
377
385
|
|
|
@@ -397,7 +405,7 @@ mod tests {
|
|
|
397
405
|
let mut corrupted = valid_cursor.into_inner();
|
|
398
406
|
corrupted.truncate(corrupted.len() / 2);
|
|
399
407
|
|
|
400
|
-
let result = extract_zip_metadata(&corrupted);
|
|
408
|
+
let result = extract_zip_metadata(&corrupted, &default_limits());
|
|
401
409
|
assert!(result.is_err());
|
|
402
410
|
|
|
403
411
|
if let Err(e) = result {
|
|
@@ -424,7 +432,7 @@ mod tests {
|
|
|
424
432
|
let mut corrupted = valid_cursor.into_inner();
|
|
425
433
|
corrupted[100] = 0xFF;
|
|
426
434
|
|
|
427
|
-
let result = extract_tar_metadata(&corrupted);
|
|
435
|
+
let result = extract_tar_metadata(&corrupted, &default_limits());
|
|
428
436
|
assert!(result.is_err());
|
|
429
437
|
}
|
|
430
438
|
|
|
@@ -437,7 +445,7 @@ mod tests {
|
|
|
437
445
|
}
|
|
438
446
|
|
|
439
447
|
let bytes = cursor.into_inner();
|
|
440
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
448
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
441
449
|
|
|
442
450
|
assert_eq!(metadata.format, "ZIP");
|
|
443
451
|
assert_eq!(metadata.file_count, 0);
|
|
@@ -454,7 +462,7 @@ mod tests {
|
|
|
454
462
|
}
|
|
455
463
|
|
|
456
464
|
let bytes = cursor.into_inner();
|
|
457
|
-
let metadata = extract_tar_metadata(&bytes).unwrap();
|
|
465
|
+
let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
|
|
458
466
|
|
|
459
467
|
assert_eq!(metadata.format, "TAR");
|
|
460
468
|
assert_eq!(metadata.file_count, 0);
|
|
@@ -485,7 +493,7 @@ mod tests {
|
|
|
485
493
|
}
|
|
486
494
|
|
|
487
495
|
let bytes = cursor.into_inner();
|
|
488
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
496
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
489
497
|
|
|
490
498
|
assert_eq!(contents.len(), 3);
|
|
491
499
|
assert_eq!(contents.get("file1.txt").unwrap(), "Content 1");
|
|
@@ -519,7 +527,7 @@ mod tests {
|
|
|
519
527
|
}
|
|
520
528
|
|
|
521
529
|
let bytes = cursor.into_inner();
|
|
522
|
-
let contents = extract_tar_text_content(&bytes).unwrap();
|
|
530
|
+
let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
|
|
523
531
|
|
|
524
532
|
assert_eq!(contents.len(), 4);
|
|
525
533
|
assert_eq!(contents.get("file1.txt").unwrap(), "Content 1");
|
|
@@ -552,7 +560,7 @@ mod tests {
|
|
|
552
560
|
}
|
|
553
561
|
|
|
554
562
|
let bytes = cursor.into_inner();
|
|
555
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
563
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
556
564
|
|
|
557
565
|
let paths: Vec<&str> = metadata.file_list.iter().map(|e| e.path.as_str()).collect();
|
|
558
566
|
assert!(paths.contains(&"root/"));
|
|
@@ -579,12 +587,12 @@ mod tests {
|
|
|
579
587
|
}
|
|
580
588
|
|
|
581
589
|
let bytes = cursor.into_inner();
|
|
582
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
590
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
583
591
|
|
|
584
592
|
assert_eq!(metadata.file_count, 1);
|
|
585
593
|
assert_eq!(metadata.total_size, 10_000);
|
|
586
594
|
|
|
587
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
595
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
588
596
|
assert_eq!(contents.get("large.txt").unwrap().len(), 10_000);
|
|
589
597
|
}
|
|
590
598
|
|
|
@@ -607,12 +615,12 @@ mod tests {
|
|
|
607
615
|
}
|
|
608
616
|
|
|
609
617
|
let bytes = cursor.into_inner();
|
|
610
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
618
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
611
619
|
|
|
612
620
|
assert_eq!(metadata.file_count, 100);
|
|
613
621
|
assert_eq!(metadata.file_list.len(), 100);
|
|
614
622
|
|
|
615
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
623
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
616
624
|
assert_eq!(contents.len(), 100);
|
|
617
625
|
}
|
|
618
626
|
|
|
@@ -632,12 +640,12 @@ mod tests {
|
|
|
632
640
|
}
|
|
633
641
|
|
|
634
642
|
let bytes = cursor.into_inner();
|
|
635
|
-
let metadata = extract_zip_metadata(&bytes).unwrap();
|
|
643
|
+
let metadata = extract_zip_metadata(&bytes, &default_limits()).unwrap();
|
|
636
644
|
|
|
637
645
|
assert_eq!(metadata.file_count, 1);
|
|
638
646
|
assert!(metadata.file_list[0].path.len() > 200);
|
|
639
647
|
|
|
640
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
648
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
641
649
|
assert_eq!(contents.len(), 1);
|
|
642
650
|
}
|
|
643
651
|
|
|
@@ -665,7 +673,7 @@ mod tests {
|
|
|
665
673
|
};
|
|
666
674
|
|
|
667
675
|
let bytes = cursor.into_inner();
|
|
668
|
-
let contents = extract_7z_text_content(&bytes).unwrap();
|
|
676
|
+
let contents = extract_7z_text_content(&bytes, &default_limits()).unwrap();
|
|
669
677
|
|
|
670
678
|
assert_eq!(contents.len(), 2);
|
|
671
679
|
assert_eq!(contents.get("test.txt").unwrap(), "Hello 7z text!");
|
|
@@ -683,7 +691,7 @@ mod tests {
|
|
|
683
691
|
};
|
|
684
692
|
|
|
685
693
|
let bytes = cursor.into_inner();
|
|
686
|
-
let metadata = extract_7z_metadata(&bytes).unwrap();
|
|
694
|
+
let metadata = extract_7z_metadata(&bytes, &default_limits()).unwrap();
|
|
687
695
|
|
|
688
696
|
assert_eq!(metadata.format, "7Z");
|
|
689
697
|
assert_eq!(metadata.file_count, 0);
|
|
@@ -708,12 +716,12 @@ mod tests {
|
|
|
708
716
|
}
|
|
709
717
|
|
|
710
718
|
let bytes = cursor.into_inner();
|
|
711
|
-
let metadata = extract_tar_metadata(&bytes).unwrap();
|
|
719
|
+
let metadata = extract_tar_metadata(&bytes, &default_limits()).unwrap();
|
|
712
720
|
|
|
713
721
|
assert_eq!(metadata.file_count, 1);
|
|
714
722
|
assert_eq!(metadata.total_size, 50_000);
|
|
715
723
|
|
|
716
|
-
let contents = extract_tar_text_content(&bytes).unwrap();
|
|
724
|
+
let contents = extract_tar_text_content(&bytes, &default_limits()).unwrap();
|
|
717
725
|
assert_eq!(contents.get("large.txt").unwrap().len(), 50_000);
|
|
718
726
|
}
|
|
719
727
|
|
|
@@ -740,7 +748,7 @@ mod tests {
|
|
|
740
748
|
}
|
|
741
749
|
|
|
742
750
|
let bytes = cursor.into_inner();
|
|
743
|
-
let contents = extract_zip_text_content(&bytes).unwrap();
|
|
751
|
+
let contents = extract_zip_text_content(&bytes, &default_limits()).unwrap();
|
|
744
752
|
|
|
745
753
|
assert_eq!(contents.len(), 2);
|
|
746
754
|
assert!(contents.contains_key("document.txt"));
|
|
@@ -755,11 +763,119 @@ mod tests {
|
|
|
755
763
|
|
|
756
764
|
let invalid_7z_data = vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C, 0x00];
|
|
757
765
|
|
|
758
|
-
let result = extract_7z_metadata(&invalid_7z_data);
|
|
766
|
+
let result = extract_7z_metadata(&invalid_7z_data, &default_limits());
|
|
759
767
|
assert!(result.is_err());
|
|
760
768
|
|
|
761
769
|
if let Err(e) = result {
|
|
762
770
|
assert!(matches!(e, KreuzbergError::Parsing { .. }));
|
|
763
771
|
}
|
|
764
772
|
}
|
|
773
|
+
|
|
774
|
+
#[test]
|
|
775
|
+
fn test_extract_gzip_metadata() {
|
|
776
|
+
use flate2::Compression;
|
|
777
|
+
use flate2::write::GzEncoder;
|
|
778
|
+
use std::io::Write;
|
|
779
|
+
|
|
780
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
781
|
+
encoder.write_all(b"Hello from gzip!").unwrap();
|
|
782
|
+
let compressed = encoder.finish().unwrap();
|
|
783
|
+
|
|
784
|
+
let metadata = extract_gzip_metadata(&compressed, &default_limits()).unwrap();
|
|
785
|
+
assert_eq!(metadata.format, "GZIP");
|
|
786
|
+
assert_eq!(metadata.file_count, 1);
|
|
787
|
+
assert_eq!(metadata.total_size, 16);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
#[test]
|
|
791
|
+
fn test_extract_gzip_text_content() {
|
|
792
|
+
use flate2::Compression;
|
|
793
|
+
use flate2::write::GzEncoder;
|
|
794
|
+
use std::io::Write;
|
|
795
|
+
|
|
796
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
797
|
+
encoder.write_all(b"Hello from gzip!").unwrap();
|
|
798
|
+
let compressed = encoder.finish().unwrap();
|
|
799
|
+
|
|
800
|
+
let contents = extract_gzip_text_content(&compressed, &default_limits()).unwrap();
|
|
801
|
+
assert_eq!(contents.len(), 1);
|
|
802
|
+
assert!(contents.values().next().unwrap().contains("Hello from gzip!"));
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
#[test]
|
|
806
|
+
fn test_decompress_gzip() {
|
|
807
|
+
use flate2::Compression;
|
|
808
|
+
use flate2::write::GzEncoder;
|
|
809
|
+
use std::io::Write;
|
|
810
|
+
|
|
811
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
812
|
+
encoder.write_all(b"test content").unwrap();
|
|
813
|
+
let compressed = encoder.finish().unwrap();
|
|
814
|
+
|
|
815
|
+
let decompressed = decompress_gzip(&compressed, &default_limits()).unwrap();
|
|
816
|
+
assert_eq!(String::from_utf8(decompressed).unwrap(), "test content");
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
#[test]
|
|
820
|
+
fn test_extract_gzip_invalid_data() {
|
|
821
|
+
let invalid = vec![0, 1, 2, 3, 4, 5];
|
|
822
|
+
let result = extract_gzip_metadata(&invalid, &default_limits());
|
|
823
|
+
assert!(result.is_err());
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
#[test]
|
|
827
|
+
fn test_extract_gzip_empty_content() {
|
|
828
|
+
use flate2::Compression;
|
|
829
|
+
use flate2::write::GzEncoder;
|
|
830
|
+
|
|
831
|
+
let encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
832
|
+
let compressed = encoder.finish().unwrap();
|
|
833
|
+
|
|
834
|
+
let metadata = extract_gzip_metadata(&compressed, &default_limits()).unwrap();
|
|
835
|
+
assert_eq!(metadata.format, "GZIP");
|
|
836
|
+
assert_eq!(metadata.total_size, 0);
|
|
837
|
+
}
|
|
838
|
+
|
|
839
|
+
#[test]
|
|
840
|
+
fn test_zip_too_many_files_rejected() {
|
|
841
|
+
let mut cursor = Cursor::new(Vec::new());
|
|
842
|
+
{
|
|
843
|
+
let mut zip = ZipWriter::new(&mut cursor);
|
|
844
|
+
let options = FileOptions::<'_, ()>::default();
|
|
845
|
+
|
|
846
|
+
for i in 0..5 {
|
|
847
|
+
let filename = format!("file_{}.txt", i);
|
|
848
|
+
zip.start_file(&filename, options).unwrap();
|
|
849
|
+
zip.write_all(b"content").unwrap();
|
|
850
|
+
}
|
|
851
|
+
zip.finish().unwrap();
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
let bytes = cursor.into_inner();
|
|
855
|
+
let limits = SecurityLimits {
|
|
856
|
+
max_files_in_archive: 3,
|
|
857
|
+
..SecurityLimits::default()
|
|
858
|
+
};
|
|
859
|
+
let result = extract_zip_metadata(&bytes, &limits);
|
|
860
|
+
assert!(result.is_err());
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
#[test]
|
|
864
|
+
fn test_gzip_bomb_rejected() {
|
|
865
|
+
use flate2::Compression;
|
|
866
|
+
use flate2::write::GzEncoder;
|
|
867
|
+
use std::io::Write;
|
|
868
|
+
|
|
869
|
+
// Create data that exceeds a tiny limit
|
|
870
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
871
|
+
encoder.write_all(&[b'A'; 1024]).unwrap();
|
|
872
|
+
let compressed = encoder.finish().unwrap();
|
|
873
|
+
|
|
874
|
+
let limits = SecurityLimits {
|
|
875
|
+
max_archive_size: 100, // 100 bytes limit
|
|
876
|
+
..SecurityLimits::default()
|
|
877
|
+
};
|
|
878
|
+
let result = extract_gzip_metadata(&compressed, &limits);
|
|
879
|
+
assert!(result.is_err());
|
|
880
|
+
}
|
|
765
881
|
}
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
6
6
|
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use crate::extractors::security::SecurityLimits;
|
|
7
8
|
use sevenz_rust2::{ArchiveReader, Password};
|
|
8
9
|
use std::collections::HashMap;
|
|
9
10
|
use std::io::Cursor;
|
|
@@ -13,6 +14,7 @@ use std::io::Cursor;
|
|
|
13
14
|
/// # Arguments
|
|
14
15
|
///
|
|
15
16
|
/// * `bytes` - The 7z archive bytes
|
|
17
|
+
/// * `limits` - Security limits for archive extraction
|
|
16
18
|
///
|
|
17
19
|
/// # Returns
|
|
18
20
|
///
|
|
@@ -24,8 +26,9 @@ use std::io::Cursor;
|
|
|
24
26
|
///
|
|
25
27
|
/// # Errors
|
|
26
28
|
///
|
|
27
|
-
/// Returns an error if the 7z archive cannot be read or parsed
|
|
28
|
-
|
|
29
|
+
/// Returns an error if the 7z archive cannot be read or parsed,
|
|
30
|
+
/// or if security limits are exceeded.
|
|
31
|
+
pub fn extract_7z_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
29
32
|
let cursor = Cursor::new(bytes);
|
|
30
33
|
let archive = ArchiveReader::new(cursor, Password::empty())
|
|
31
34
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
@@ -33,7 +36,16 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
33
36
|
let mut file_list = Vec::new();
|
|
34
37
|
let mut total_size = 0u64;
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
let files = &archive.archive().files;
|
|
40
|
+
if files.len() > limits.max_files_in_archive {
|
|
41
|
+
return Err(KreuzbergError::validation(format!(
|
|
42
|
+
"7z archive has too many files: {} (max: {})",
|
|
43
|
+
files.len(),
|
|
44
|
+
limits.max_files_in_archive
|
|
45
|
+
)));
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
for entry in files {
|
|
37
49
|
let path = entry.name().to_string();
|
|
38
50
|
let size = entry.size();
|
|
39
51
|
let is_dir = entry.is_directory();
|
|
@@ -42,6 +54,13 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
42
54
|
total_size += size;
|
|
43
55
|
}
|
|
44
56
|
|
|
57
|
+
if total_size > limits.max_archive_size as u64 {
|
|
58
|
+
return Err(KreuzbergError::validation(format!(
|
|
59
|
+
"7z archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
|
|
60
|
+
total_size, limits.max_archive_size
|
|
61
|
+
)));
|
|
62
|
+
}
|
|
63
|
+
|
|
45
64
|
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
46
65
|
}
|
|
47
66
|
|
|
@@ -71,12 +90,22 @@ pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
71
90
|
/// # Errors
|
|
72
91
|
///
|
|
73
92
|
/// Returns an error if the 7z archive cannot be read or parsed, or if security limits (file count, total text size) are exceeded.
|
|
74
|
-
pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
93
|
+
pub fn extract_7z_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
75
94
|
let cursor = Cursor::new(bytes);
|
|
76
95
|
let mut archive = ArchiveReader::new(cursor, Password::empty())
|
|
77
96
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
|
|
78
97
|
|
|
98
|
+
let file_count = archive.archive().files.len();
|
|
99
|
+
if file_count > limits.max_files_in_archive {
|
|
100
|
+
return Err(KreuzbergError::validation(format!(
|
|
101
|
+
"7z archive has too many files: {} (max: {})",
|
|
102
|
+
file_count, limits.max_files_in_archive
|
|
103
|
+
)));
|
|
104
|
+
}
|
|
105
|
+
|
|
79
106
|
let mut contents = HashMap::new();
|
|
107
|
+
let max_content_size = limits.max_content_size;
|
|
108
|
+
let mut total_content_size = 0usize;
|
|
80
109
|
|
|
81
110
|
archive
|
|
82
111
|
.for_each_entries(|entry, reader| {
|
|
@@ -87,6 +116,10 @@ pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
87
116
|
if let Ok(_) = reader.read_to_end(&mut content)
|
|
88
117
|
&& let Ok(text) = String::from_utf8(content)
|
|
89
118
|
{
|
|
119
|
+
total_content_size = total_content_size.saturating_add(text.len());
|
|
120
|
+
if total_content_size > max_content_size {
|
|
121
|
+
return Ok(false);
|
|
122
|
+
}
|
|
90
123
|
contents.insert(path, text);
|
|
91
124
|
}
|
|
92
125
|
}
|
|
@@ -94,5 +127,12 @@ pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
94
127
|
})
|
|
95
128
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z entries: {}", e)))?;
|
|
96
129
|
|
|
130
|
+
if total_content_size > max_content_size {
|
|
131
|
+
return Err(KreuzbergError::validation(format!(
|
|
132
|
+
"7z archive text content exceeds limit: {} bytes (max: {} bytes)",
|
|
133
|
+
total_content_size, max_content_size
|
|
134
|
+
)));
|
|
135
|
+
}
|
|
136
|
+
|
|
97
137
|
Ok(contents)
|
|
98
138
|
}
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
7
7
|
use crate::error::{KreuzbergError, Result};
|
|
8
|
+
use crate::extractors::security::SecurityLimits;
|
|
8
9
|
use std::collections::HashMap;
|
|
9
10
|
use std::io::{Cursor, Read};
|
|
10
11
|
use tar::Archive as TarArchive;
|
|
@@ -14,6 +15,7 @@ use tar::Archive as TarArchive;
|
|
|
14
15
|
/// # Arguments
|
|
15
16
|
///
|
|
16
17
|
/// * `bytes` - The TAR archive bytes (can be compressed with gzip or bzip2)
|
|
18
|
+
/// * `limits` - Security limits for archive extraction
|
|
17
19
|
///
|
|
18
20
|
/// # Returns
|
|
19
21
|
///
|
|
@@ -25,8 +27,9 @@ use tar::Archive as TarArchive;
|
|
|
25
27
|
///
|
|
26
28
|
/// # Errors
|
|
27
29
|
///
|
|
28
|
-
/// Returns an error if the TAR archive cannot be read or parsed
|
|
29
|
-
|
|
30
|
+
/// Returns an error if the TAR archive cannot be read or parsed,
|
|
31
|
+
/// or if security limits are exceeded.
|
|
32
|
+
pub fn extract_tar_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
30
33
|
let cursor = Cursor::new(bytes);
|
|
31
34
|
let mut archive = TarArchive::new(cursor);
|
|
32
35
|
|
|
@@ -56,6 +59,21 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
56
59
|
}
|
|
57
60
|
|
|
58
61
|
file_count += 1;
|
|
62
|
+
|
|
63
|
+
if file_count > limits.max_files_in_archive {
|
|
64
|
+
return Err(KreuzbergError::validation(format!(
|
|
65
|
+
"TAR archive has too many files: {} (max: {})",
|
|
66
|
+
file_count, limits.max_files_in_archive
|
|
67
|
+
)));
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
if total_size > limits.max_archive_size as u64 {
|
|
71
|
+
return Err(KreuzbergError::validation(format!(
|
|
72
|
+
"TAR archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
|
|
73
|
+
total_size, limits.max_archive_size
|
|
74
|
+
)));
|
|
75
|
+
}
|
|
76
|
+
|
|
59
77
|
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
60
78
|
}
|
|
61
79
|
|
|
@@ -83,12 +101,14 @@ pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
83
101
|
/// # Errors
|
|
84
102
|
///
|
|
85
103
|
/// Returns an error if the TAR archive cannot be read or parsed, or if security limits (file count, total text size) are exceeded.
|
|
86
|
-
pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
104
|
+
pub fn extract_tar_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
87
105
|
let cursor = Cursor::new(bytes);
|
|
88
106
|
let mut archive = TarArchive::new(cursor);
|
|
89
107
|
|
|
90
108
|
let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
|
|
91
109
|
let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
|
|
110
|
+
let mut file_count = 0usize;
|
|
111
|
+
let mut total_content_size = 0usize;
|
|
92
112
|
|
|
93
113
|
let entries = archive
|
|
94
114
|
.entries()
|
|
@@ -98,6 +118,14 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
98
118
|
let mut entry =
|
|
99
119
|
entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
|
|
100
120
|
|
|
121
|
+
file_count += 1;
|
|
122
|
+
if file_count > limits.max_files_in_archive {
|
|
123
|
+
return Err(KreuzbergError::validation(format!(
|
|
124
|
+
"TAR archive has too many files: {} (max: {})",
|
|
125
|
+
file_count, limits.max_files_in_archive
|
|
126
|
+
)));
|
|
127
|
+
}
|
|
128
|
+
|
|
101
129
|
let path = entry
|
|
102
130
|
.path()
|
|
103
131
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
|
|
@@ -109,6 +137,13 @@ pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
109
137
|
let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
|
|
110
138
|
let mut content = String::with_capacity(estimated_size);
|
|
111
139
|
if entry.read_to_string(&mut content).is_ok() {
|
|
140
|
+
total_content_size = total_content_size.saturating_add(content.len());
|
|
141
|
+
if total_content_size > limits.max_content_size {
|
|
142
|
+
return Err(KreuzbergError::validation(format!(
|
|
143
|
+
"TAR archive text content exceeds limit: {} bytes (max: {} bytes)",
|
|
144
|
+
total_content_size, limits.max_content_size
|
|
145
|
+
)));
|
|
146
|
+
}
|
|
112
147
|
contents.insert(path, content);
|
|
113
148
|
}
|
|
114
149
|
}
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use super::{ArchiveEntry, ArchiveMetadata, TEXT_EXTENSIONS};
|
|
6
6
|
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use crate::extractors::security::SecurityLimits;
|
|
7
8
|
use std::collections::HashMap;
|
|
8
9
|
use std::io::{Cursor, Read};
|
|
9
10
|
use zip::ZipArchive;
|
|
@@ -13,6 +14,7 @@ use zip::ZipArchive;
|
|
|
13
14
|
/// # Arguments
|
|
14
15
|
///
|
|
15
16
|
/// * `bytes` - The ZIP archive bytes
|
|
17
|
+
/// * `limits` - Security limits for archive extraction
|
|
16
18
|
///
|
|
17
19
|
/// # Returns
|
|
18
20
|
///
|
|
@@ -24,12 +26,21 @@ use zip::ZipArchive;
|
|
|
24
26
|
///
|
|
25
27
|
/// # Errors
|
|
26
28
|
///
|
|
27
|
-
/// Returns an error if the ZIP archive cannot be read or parsed
|
|
28
|
-
|
|
29
|
+
/// Returns an error if the ZIP archive cannot be read or parsed,
|
|
30
|
+
/// or if security limits are exceeded.
|
|
31
|
+
pub fn extract_zip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
29
32
|
let cursor = Cursor::new(bytes);
|
|
30
33
|
let mut archive =
|
|
31
34
|
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
32
35
|
|
|
36
|
+
if archive.len() > limits.max_files_in_archive {
|
|
37
|
+
return Err(KreuzbergError::validation(format!(
|
|
38
|
+
"ZIP archive has too many files: {} (max: {})",
|
|
39
|
+
archive.len(),
|
|
40
|
+
limits.max_files_in_archive
|
|
41
|
+
)));
|
|
42
|
+
}
|
|
43
|
+
|
|
33
44
|
let mut file_list = Vec::with_capacity(archive.len());
|
|
34
45
|
let mut total_size = 0u64;
|
|
35
46
|
|
|
@@ -46,6 +57,13 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
46
57
|
total_size += size;
|
|
47
58
|
}
|
|
48
59
|
|
|
60
|
+
if total_size > limits.max_archive_size as u64 {
|
|
61
|
+
return Err(KreuzbergError::validation(format!(
|
|
62
|
+
"ZIP archive total uncompressed size exceeds limit: {} bytes (max: {} bytes)",
|
|
63
|
+
total_size, limits.max_archive_size
|
|
64
|
+
)));
|
|
65
|
+
}
|
|
66
|
+
|
|
49
67
|
file_list.push(ArchiveEntry { path, size, is_dir });
|
|
50
68
|
}
|
|
51
69
|
|
|
@@ -73,13 +91,22 @@ pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
|
|
|
73
91
|
/// # Errors
|
|
74
92
|
///
|
|
75
93
|
/// Returns an error if the ZIP archive cannot be read or parsed, or if security limits (file count, total text size) are exceeded.
|
|
76
|
-
pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
|
|
94
|
+
pub fn extract_zip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
77
95
|
let cursor = Cursor::new(bytes);
|
|
78
96
|
let mut archive =
|
|
79
97
|
ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
80
98
|
|
|
99
|
+
if archive.len() > limits.max_files_in_archive {
|
|
100
|
+
return Err(KreuzbergError::validation(format!(
|
|
101
|
+
"ZIP archive has too many files: {} (max: {})",
|
|
102
|
+
archive.len(),
|
|
103
|
+
limits.max_files_in_archive
|
|
104
|
+
)));
|
|
105
|
+
}
|
|
106
|
+
|
|
81
107
|
let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
|
|
82
108
|
let mut contents = HashMap::with_capacity(estimated_text_files);
|
|
109
|
+
let mut total_content_size = 0usize;
|
|
83
110
|
|
|
84
111
|
for i in 0..archive.len() {
|
|
85
112
|
let mut file = archive
|
|
@@ -92,6 +119,13 @@ pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>>
|
|
|
92
119
|
let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
|
|
93
120
|
let mut content = String::with_capacity(estimated_size);
|
|
94
121
|
if file.read_to_string(&mut content).is_ok() {
|
|
122
|
+
total_content_size = total_content_size.saturating_add(content.len());
|
|
123
|
+
if total_content_size > limits.max_content_size {
|
|
124
|
+
return Err(KreuzbergError::validation(format!(
|
|
125
|
+
"ZIP archive text content exceeds limit: {} bytes (max: {} bytes)",
|
|
126
|
+
total_content_size, limits.max_content_size
|
|
127
|
+
)));
|
|
128
|
+
}
|
|
95
129
|
contents.insert(path, content);
|
|
96
130
|
}
|
|
97
131
|
}
|