kreuzberg 4.2.12 → 4.2.13

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -7
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  19. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  20. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  21. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  22. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  23. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  24. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  25. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  26. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  27. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  28. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  29. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  30. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  31. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. metadata +4 -2
@@ -1,17 +1,19 @@
1
- //! Archive extractors for ZIP, TAR, and 7z formats.
1
+ //! Archive extractors for ZIP, TAR, 7z, and GZIP formats.
2
2
 
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::extraction::archive::{
6
- ArchiveMetadata as ExtractedMetadata, extract_7z_metadata, extract_7z_text_content, extract_tar_metadata,
7
- extract_tar_text_content, extract_zip_metadata, extract_zip_text_content,
6
+ ArchiveMetadata as ExtractedMetadata, extract_7z_metadata, extract_7z_text_content, extract_gzip,
7
+ extract_tar_metadata, extract_tar_text_content, extract_zip_metadata, extract_zip_text_content,
8
8
  };
9
+ use crate::extractors::security::ZipBombValidator;
9
10
  use crate::plugins::{DocumentExtractor, Plugin};
10
11
  use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
11
12
  use ahash::AHashMap;
12
13
  use async_trait::async_trait;
13
14
  use std::borrow::Cow;
14
15
  use std::collections::HashMap;
16
+ use std::io::Cursor;
15
17
 
16
18
  /// Build an ExtractionResult from archive metadata and text contents.
17
19
  ///
@@ -132,7 +134,7 @@ impl Plugin for ZipExtractor {
132
134
  #[async_trait]
133
135
  impl DocumentExtractor for ZipExtractor {
134
136
  #[cfg_attr(feature = "otel", tracing::instrument(
135
- skip(self, content, _config),
137
+ skip(self, content, config),
136
138
  fields(
137
139
  extractor.name = self.name(),
138
140
  content.size_bytes = content.len(),
@@ -142,10 +144,21 @@ impl DocumentExtractor for ZipExtractor {
142
144
  &self,
143
145
  content: &[u8],
144
146
  mime_type: &str,
145
- _config: &ExtractionConfig,
147
+ config: &ExtractionConfig,
146
148
  ) -> Result<ExtractionResult> {
147
- let extraction_metadata = extract_zip_metadata(content)?;
148
- let text_contents = extract_zip_text_content(content)?;
149
+ let limits = config.security_limits.clone().unwrap_or_default();
150
+
151
+ // Validate ZIP archive for bomb attacks before extraction
152
+ let cursor = Cursor::new(content);
153
+ let mut archive = zip::ZipArchive::new(cursor)
154
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
155
+ let validator = ZipBombValidator::new(limits.clone());
156
+ validator
157
+ .validate(&mut archive)
158
+ .map_err(|e| crate::error::KreuzbergError::validation(e.to_string()))?;
159
+
160
+ let extraction_metadata = extract_zip_metadata(content, &limits)?;
161
+ let text_contents = extract_zip_text_content(content, &limits)?;
149
162
  Ok(build_archive_result(
150
163
  extraction_metadata,
151
164
  text_contents,
@@ -210,7 +223,7 @@ impl Plugin for TarExtractor {
210
223
  #[async_trait]
211
224
  impl DocumentExtractor for TarExtractor {
212
225
  #[cfg_attr(feature = "otel", tracing::instrument(
213
- skip(self, content, _config),
226
+ skip(self, content, config),
214
227
  fields(
215
228
  extractor.name = self.name(),
216
229
  content.size_bytes = content.len(),
@@ -220,10 +233,11 @@ impl DocumentExtractor for TarExtractor {
220
233
  &self,
221
234
  content: &[u8],
222
235
  mime_type: &str,
223
- _config: &ExtractionConfig,
236
+ config: &ExtractionConfig,
224
237
  ) -> Result<ExtractionResult> {
225
- let extraction_metadata = extract_tar_metadata(content)?;
226
- let text_contents = extract_tar_text_content(content)?;
238
+ let limits = config.security_limits.clone().unwrap_or_default();
239
+ let extraction_metadata = extract_tar_metadata(content, &limits)?;
240
+ let text_contents = extract_tar_text_content(content, &limits)?;
227
241
  Ok(build_archive_result(
228
242
  extraction_metadata,
229
243
  text_contents,
@@ -293,7 +307,7 @@ impl Plugin for SevenZExtractor {
293
307
  #[async_trait]
294
308
  impl DocumentExtractor for SevenZExtractor {
295
309
  #[cfg_attr(feature = "otel", tracing::instrument(
296
- skip(self, content, _config),
310
+ skip(self, content, config),
297
311
  fields(
298
312
  extractor.name = self.name(),
299
313
  content.size_bytes = content.len(),
@@ -303,10 +317,11 @@ impl DocumentExtractor for SevenZExtractor {
303
317
  &self,
304
318
  content: &[u8],
305
319
  mime_type: &str,
306
- _config: &ExtractionConfig,
320
+ config: &ExtractionConfig,
307
321
  ) -> Result<ExtractionResult> {
308
- let extraction_metadata = extract_7z_metadata(content)?;
309
- let text_contents = extract_7z_text_content(content)?;
322
+ let limits = config.security_limits.clone().unwrap_or_default();
323
+ let extraction_metadata = extract_7z_metadata(content, &limits)?;
324
+ let text_contents = extract_7z_text_content(content, &limits)?;
310
325
  Ok(build_archive_result(
311
326
  extraction_metadata,
312
327
  text_contents,
@@ -324,6 +339,84 @@ impl DocumentExtractor for SevenZExtractor {
324
339
  }
325
340
  }
326
341
 
342
+ /// Gzip archive extractor.
343
+ ///
344
+ /// Decompresses gzip files and extracts text content from the compressed data.
345
+ pub struct GzipExtractor;
346
+
347
+ impl GzipExtractor {
348
+ /// Create a new gzip extractor.
349
+ pub fn new() -> Self {
350
+ Self
351
+ }
352
+ }
353
+
354
+ impl Default for GzipExtractor {
355
+ fn default() -> Self {
356
+ Self::new()
357
+ }
358
+ }
359
+
360
+ impl Plugin for GzipExtractor {
361
+ fn name(&self) -> &str {
362
+ "gzip-extractor"
363
+ }
364
+
365
+ fn version(&self) -> String {
366
+ env!("CARGO_PKG_VERSION").to_string()
367
+ }
368
+
369
+ fn initialize(&self) -> Result<()> {
370
+ Ok(())
371
+ }
372
+
373
+ fn shutdown(&self) -> Result<()> {
374
+ Ok(())
375
+ }
376
+
377
+ fn description(&self) -> &str {
378
+ "Decompresses and extracts text content from gzip-compressed files"
379
+ }
380
+
381
+ fn author(&self) -> &str {
382
+ "Kreuzberg Team"
383
+ }
384
+ }
385
+
386
+ #[async_trait]
387
+ impl DocumentExtractor for GzipExtractor {
388
+ #[cfg_attr(feature = "otel", tracing::instrument(
389
+ skip(self, content, config),
390
+ fields(
391
+ extractor.name = self.name(),
392
+ content.size_bytes = content.len(),
393
+ )
394
+ ))]
395
+ async fn extract_bytes(
396
+ &self,
397
+ content: &[u8],
398
+ mime_type: &str,
399
+ config: &ExtractionConfig,
400
+ ) -> Result<ExtractionResult> {
401
+ let limits = config.security_limits.clone().unwrap_or_default();
402
+ let (extraction_metadata, text_contents) = extract_gzip(content, &limits)?;
403
+ Ok(build_archive_result(
404
+ extraction_metadata,
405
+ text_contents,
406
+ "GZIP",
407
+ mime_type,
408
+ ))
409
+ }
410
+
411
+ fn supported_mime_types(&self) -> &[&str] {
412
+ &["application/gzip", "application/x-gzip"]
413
+ }
414
+
415
+ fn priority(&self) -> i32 {
416
+ 50
417
+ }
418
+ }
419
+
327
420
  #[cfg(test)]
328
421
  mod tests {
329
422
  use super::*;
@@ -448,4 +541,42 @@ mod tests {
448
541
  assert!(extractor.supported_mime_types().contains(&"application/tar"));
449
542
  assert_eq!(extractor.priority(), 50);
450
543
  }
544
+
545
+ #[test]
546
+ fn test_gzip_plugin_interface() {
547
+ let extractor = GzipExtractor::new();
548
+ assert_eq!(extractor.name(), "gzip-extractor");
549
+ assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
550
+ assert!(extractor.supported_mime_types().contains(&"application/gzip"));
551
+ assert!(extractor.supported_mime_types().contains(&"application/x-gzip"));
552
+ assert_eq!(extractor.priority(), 50);
553
+ }
554
+
555
+ #[tokio::test]
556
+ async fn test_gzip_extractor_valid_data() {
557
+ use flate2::Compression;
558
+ use flate2::write::GzEncoder;
559
+ use std::io::Write;
560
+
561
+ let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
562
+ encoder.write_all(b"Hello from gzip extraction!").unwrap();
563
+ let compressed = encoder.finish().unwrap();
564
+
565
+ let extractor = GzipExtractor::new();
566
+ let config = ExtractionConfig::default();
567
+ let result = extractor.extract_bytes(&compressed, "application/gzip", &config).await;
568
+ assert!(result.is_ok());
569
+ let extraction = result.unwrap();
570
+ assert!(extraction.content.contains("Hello from gzip extraction!"));
571
+ }
572
+
573
+ #[tokio::test]
574
+ async fn test_gzip_extractor_invalid_data() {
575
+ let extractor = GzipExtractor::new();
576
+ let config = ExtractionConfig::default();
577
+ let result = extractor
578
+ .extract_bytes(&[0, 1, 2, 3], "application/gzip", &config)
579
+ .await;
580
+ assert!(result.is_err());
581
+ }
451
582
  }
@@ -180,7 +180,7 @@ impl DocumentExtractor for BibtexExtractor {
180
180
  }
181
181
 
182
182
  fn supported_mime_types(&self) -> &[&str] {
183
- &["application/x-bibtex", "text/x-bibtex"]
183
+ &["application/x-bibtex", "text/x-bibtex", "application/x-biblatex"]
184
184
  }
185
185
 
186
186
  fn priority(&self) -> i32 {
@@ -199,7 +199,8 @@ mod tests {
199
199
 
200
200
  assert!(supported.contains(&"application/x-bibtex"));
201
201
  assert!(supported.contains(&"text/x-bibtex"));
202
- assert_eq!(supported.len(), 2);
202
+ assert!(supported.contains(&"application/x-biblatex"));
203
+ assert_eq!(supported.len(), 3);
203
204
  }
204
205
 
205
206
  #[tokio::test]