kreuzberg 4.2.12 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -7
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
|
@@ -1,17 +1,19 @@
|
|
|
1
|
-
//! Archive extractors for ZIP, TAR, and
|
|
1
|
+
//! Archive extractors for ZIP, TAR, 7z, and GZIP formats.
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::extraction::archive::{
|
|
6
|
-
ArchiveMetadata as ExtractedMetadata, extract_7z_metadata, extract_7z_text_content,
|
|
7
|
-
extract_tar_text_content, extract_zip_metadata, extract_zip_text_content,
|
|
6
|
+
ArchiveMetadata as ExtractedMetadata, extract_7z_metadata, extract_7z_text_content, extract_gzip,
|
|
7
|
+
extract_tar_metadata, extract_tar_text_content, extract_zip_metadata, extract_zip_text_content,
|
|
8
8
|
};
|
|
9
|
+
use crate::extractors::security::ZipBombValidator;
|
|
9
10
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
10
11
|
use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
|
|
11
12
|
use ahash::AHashMap;
|
|
12
13
|
use async_trait::async_trait;
|
|
13
14
|
use std::borrow::Cow;
|
|
14
15
|
use std::collections::HashMap;
|
|
16
|
+
use std::io::Cursor;
|
|
15
17
|
|
|
16
18
|
/// Build an ExtractionResult from archive metadata and text contents.
|
|
17
19
|
///
|
|
@@ -132,7 +134,7 @@ impl Plugin for ZipExtractor {
|
|
|
132
134
|
#[async_trait]
|
|
133
135
|
impl DocumentExtractor for ZipExtractor {
|
|
134
136
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
135
|
-
skip(self, content,
|
|
137
|
+
skip(self, content, config),
|
|
136
138
|
fields(
|
|
137
139
|
extractor.name = self.name(),
|
|
138
140
|
content.size_bytes = content.len(),
|
|
@@ -142,10 +144,21 @@ impl DocumentExtractor for ZipExtractor {
|
|
|
142
144
|
&self,
|
|
143
145
|
content: &[u8],
|
|
144
146
|
mime_type: &str,
|
|
145
|
-
|
|
147
|
+
config: &ExtractionConfig,
|
|
146
148
|
) -> Result<ExtractionResult> {
|
|
147
|
-
let
|
|
148
|
-
|
|
149
|
+
let limits = config.security_limits.clone().unwrap_or_default();
|
|
150
|
+
|
|
151
|
+
// Validate ZIP archive for bomb attacks before extraction
|
|
152
|
+
let cursor = Cursor::new(content);
|
|
153
|
+
let mut archive = zip::ZipArchive::new(cursor)
|
|
154
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
|
|
155
|
+
let validator = ZipBombValidator::new(limits.clone());
|
|
156
|
+
validator
|
|
157
|
+
.validate(&mut archive)
|
|
158
|
+
.map_err(|e| crate::error::KreuzbergError::validation(e.to_string()))?;
|
|
159
|
+
|
|
160
|
+
let extraction_metadata = extract_zip_metadata(content, &limits)?;
|
|
161
|
+
let text_contents = extract_zip_text_content(content, &limits)?;
|
|
149
162
|
Ok(build_archive_result(
|
|
150
163
|
extraction_metadata,
|
|
151
164
|
text_contents,
|
|
@@ -210,7 +223,7 @@ impl Plugin for TarExtractor {
|
|
|
210
223
|
#[async_trait]
|
|
211
224
|
impl DocumentExtractor for TarExtractor {
|
|
212
225
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
213
|
-
skip(self, content,
|
|
226
|
+
skip(self, content, config),
|
|
214
227
|
fields(
|
|
215
228
|
extractor.name = self.name(),
|
|
216
229
|
content.size_bytes = content.len(),
|
|
@@ -220,10 +233,11 @@ impl DocumentExtractor for TarExtractor {
|
|
|
220
233
|
&self,
|
|
221
234
|
content: &[u8],
|
|
222
235
|
mime_type: &str,
|
|
223
|
-
|
|
236
|
+
config: &ExtractionConfig,
|
|
224
237
|
) -> Result<ExtractionResult> {
|
|
225
|
-
let
|
|
226
|
-
let
|
|
238
|
+
let limits = config.security_limits.clone().unwrap_or_default();
|
|
239
|
+
let extraction_metadata = extract_tar_metadata(content, &limits)?;
|
|
240
|
+
let text_contents = extract_tar_text_content(content, &limits)?;
|
|
227
241
|
Ok(build_archive_result(
|
|
228
242
|
extraction_metadata,
|
|
229
243
|
text_contents,
|
|
@@ -293,7 +307,7 @@ impl Plugin for SevenZExtractor {
|
|
|
293
307
|
#[async_trait]
|
|
294
308
|
impl DocumentExtractor for SevenZExtractor {
|
|
295
309
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
296
|
-
skip(self, content,
|
|
310
|
+
skip(self, content, config),
|
|
297
311
|
fields(
|
|
298
312
|
extractor.name = self.name(),
|
|
299
313
|
content.size_bytes = content.len(),
|
|
@@ -303,10 +317,11 @@ impl DocumentExtractor for SevenZExtractor {
|
|
|
303
317
|
&self,
|
|
304
318
|
content: &[u8],
|
|
305
319
|
mime_type: &str,
|
|
306
|
-
|
|
320
|
+
config: &ExtractionConfig,
|
|
307
321
|
) -> Result<ExtractionResult> {
|
|
308
|
-
let
|
|
309
|
-
let
|
|
322
|
+
let limits = config.security_limits.clone().unwrap_or_default();
|
|
323
|
+
let extraction_metadata = extract_7z_metadata(content, &limits)?;
|
|
324
|
+
let text_contents = extract_7z_text_content(content, &limits)?;
|
|
310
325
|
Ok(build_archive_result(
|
|
311
326
|
extraction_metadata,
|
|
312
327
|
text_contents,
|
|
@@ -324,6 +339,84 @@ impl DocumentExtractor for SevenZExtractor {
|
|
|
324
339
|
}
|
|
325
340
|
}
|
|
326
341
|
|
|
342
|
+
/// Gzip archive extractor.
|
|
343
|
+
///
|
|
344
|
+
/// Decompresses gzip files and extracts text content from the compressed data.
|
|
345
|
+
pub struct GzipExtractor;
|
|
346
|
+
|
|
347
|
+
impl GzipExtractor {
|
|
348
|
+
/// Create a new gzip extractor.
|
|
349
|
+
pub fn new() -> Self {
|
|
350
|
+
Self
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
impl Default for GzipExtractor {
|
|
355
|
+
fn default() -> Self {
|
|
356
|
+
Self::new()
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
impl Plugin for GzipExtractor {
|
|
361
|
+
fn name(&self) -> &str {
|
|
362
|
+
"gzip-extractor"
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
fn version(&self) -> String {
|
|
366
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
fn initialize(&self) -> Result<()> {
|
|
370
|
+
Ok(())
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
fn shutdown(&self) -> Result<()> {
|
|
374
|
+
Ok(())
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
fn description(&self) -> &str {
|
|
378
|
+
"Decompresses and extracts text content from gzip-compressed files"
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
fn author(&self) -> &str {
|
|
382
|
+
"Kreuzberg Team"
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
#[async_trait]
|
|
387
|
+
impl DocumentExtractor for GzipExtractor {
|
|
388
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
389
|
+
skip(self, content, config),
|
|
390
|
+
fields(
|
|
391
|
+
extractor.name = self.name(),
|
|
392
|
+
content.size_bytes = content.len(),
|
|
393
|
+
)
|
|
394
|
+
))]
|
|
395
|
+
async fn extract_bytes(
|
|
396
|
+
&self,
|
|
397
|
+
content: &[u8],
|
|
398
|
+
mime_type: &str,
|
|
399
|
+
config: &ExtractionConfig,
|
|
400
|
+
) -> Result<ExtractionResult> {
|
|
401
|
+
let limits = config.security_limits.clone().unwrap_or_default();
|
|
402
|
+
let (extraction_metadata, text_contents) = extract_gzip(content, &limits)?;
|
|
403
|
+
Ok(build_archive_result(
|
|
404
|
+
extraction_metadata,
|
|
405
|
+
text_contents,
|
|
406
|
+
"GZIP",
|
|
407
|
+
mime_type,
|
|
408
|
+
))
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
412
|
+
&["application/gzip", "application/x-gzip"]
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
fn priority(&self) -> i32 {
|
|
416
|
+
50
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
|
|
327
420
|
#[cfg(test)]
|
|
328
421
|
mod tests {
|
|
329
422
|
use super::*;
|
|
@@ -448,4 +541,42 @@ mod tests {
|
|
|
448
541
|
assert!(extractor.supported_mime_types().contains(&"application/tar"));
|
|
449
542
|
assert_eq!(extractor.priority(), 50);
|
|
450
543
|
}
|
|
544
|
+
|
|
545
|
+
#[test]
|
|
546
|
+
fn test_gzip_plugin_interface() {
|
|
547
|
+
let extractor = GzipExtractor::new();
|
|
548
|
+
assert_eq!(extractor.name(), "gzip-extractor");
|
|
549
|
+
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
550
|
+
assert!(extractor.supported_mime_types().contains(&"application/gzip"));
|
|
551
|
+
assert!(extractor.supported_mime_types().contains(&"application/x-gzip"));
|
|
552
|
+
assert_eq!(extractor.priority(), 50);
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
#[tokio::test]
|
|
556
|
+
async fn test_gzip_extractor_valid_data() {
|
|
557
|
+
use flate2::Compression;
|
|
558
|
+
use flate2::write::GzEncoder;
|
|
559
|
+
use std::io::Write;
|
|
560
|
+
|
|
561
|
+
let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
|
|
562
|
+
encoder.write_all(b"Hello from gzip extraction!").unwrap();
|
|
563
|
+
let compressed = encoder.finish().unwrap();
|
|
564
|
+
|
|
565
|
+
let extractor = GzipExtractor::new();
|
|
566
|
+
let config = ExtractionConfig::default();
|
|
567
|
+
let result = extractor.extract_bytes(&compressed, "application/gzip", &config).await;
|
|
568
|
+
assert!(result.is_ok());
|
|
569
|
+
let extraction = result.unwrap();
|
|
570
|
+
assert!(extraction.content.contains("Hello from gzip extraction!"));
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
#[tokio::test]
|
|
574
|
+
async fn test_gzip_extractor_invalid_data() {
|
|
575
|
+
let extractor = GzipExtractor::new();
|
|
576
|
+
let config = ExtractionConfig::default();
|
|
577
|
+
let result = extractor
|
|
578
|
+
.extract_bytes(&[0, 1, 2, 3], "application/gzip", &config)
|
|
579
|
+
.await;
|
|
580
|
+
assert!(result.is_err());
|
|
581
|
+
}
|
|
451
582
|
}
|
|
@@ -180,7 +180,7 @@ impl DocumentExtractor for BibtexExtractor {
|
|
|
180
180
|
}
|
|
181
181
|
|
|
182
182
|
fn supported_mime_types(&self) -> &[&str] {
|
|
183
|
-
&["application/x-bibtex", "text/x-bibtex"]
|
|
183
|
+
&["application/x-bibtex", "text/x-bibtex", "application/x-biblatex"]
|
|
184
184
|
}
|
|
185
185
|
|
|
186
186
|
fn priority(&self) -> i32 {
|
|
@@ -199,7 +199,8 @@ mod tests {
|
|
|
199
199
|
|
|
200
200
|
assert!(supported.contains(&"application/x-bibtex"));
|
|
201
201
|
assert!(supported.contains(&"text/x-bibtex"));
|
|
202
|
-
|
|
202
|
+
assert!(supported.contains(&"application/x-biblatex"));
|
|
203
|
+
assert_eq!(supported.len(), 3);
|
|
203
204
|
}
|
|
204
205
|
|
|
205
206
|
#[tokio::test]
|