kreuzberg 4.2.12 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -7
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3c2053b10256948a215ff0d3552894991e801497ac4b2480eca3c98bb645cc27
|
|
4
|
+
data.tar.gz: 324b6147e172ecedb2338fab1b14ce2022a8b9c2d6be7fd86ac0f862d81ef7ce
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 84a6636111d240c99eb17546f80c1df31117c700d78282c18a67a79aa613021d33988cbc1b00d5bc62bb2ffeef8c2a8f1759e137329de8f30af7f61b6db1a55b
|
|
7
|
+
data.tar.gz: 7628ecce3c6fb44c06a9546f2db696ae3486de35e0a05195cbea752bc6f78e573162e6305aec8c8ae0ca0fdbb6709a3e75752b822dbd8aed637eff9577c3e020
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.12)
|
|
4
|
+
kreuzberg (4.2.13)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.12)
|
|
212
|
+
kreuzberg (4.2.13)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.12" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.2.12"
|
|
6
|
+
version = "4.2.13"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -49,7 +49,7 @@ toml = "0.9.11"
|
|
|
49
49
|
num_cpus = "1.17.0"
|
|
50
50
|
once_cell = "1.21.3"
|
|
51
51
|
html-to-markdown-rs = { version = "2.24.5", default-features = false }
|
|
52
|
-
reqwest = { version = "0.13.
|
|
52
|
+
reqwest = { version = "0.13.2", default-features = false, features = ["json", "rustls"] }
|
|
53
53
|
image = { version = "0.25.9", default-features = false }
|
|
54
54
|
lzma-rust2 = { version = "0.15.7" }
|
|
55
55
|
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.12"
|
|
3
|
+
version = "4.2.13"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -38,18 +38,18 @@ office = [
|
|
|
38
38
|
"dep:quick-xml",
|
|
39
39
|
"dep:pulldown-cmark",
|
|
40
40
|
"dep:biblatex",
|
|
41
|
+
"dep:biblib",
|
|
41
42
|
"dep:org",
|
|
42
43
|
"dep:rtf-parser",
|
|
43
44
|
"dep:rst_parser",
|
|
44
45
|
"dep:fb2",
|
|
45
46
|
"dep:typst-syntax",
|
|
46
47
|
"html",
|
|
47
|
-
"tokio-runtime",
|
|
48
48
|
]
|
|
49
49
|
email = ["dep:mail-parser", "dep:msg_parser"]
|
|
50
50
|
html = ["dep:html-to-markdown-rs"]
|
|
51
51
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
52
|
-
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2"]
|
|
52
|
+
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
|
|
53
53
|
|
|
54
54
|
ocr = [
|
|
55
55
|
"dep:kreuzberg-tesseract",
|
|
@@ -58,6 +58,8 @@ ocr = [
|
|
|
58
58
|
"dep:fast_image_resize",
|
|
59
59
|
"dep:ndarray",
|
|
60
60
|
"dep:kamadak-exif",
|
|
61
|
+
"dep:hayro-jpeg2000",
|
|
62
|
+
"dep:hayro-jbig2",
|
|
61
63
|
"html",
|
|
62
64
|
]
|
|
63
65
|
language-detection = ["dep:whatlang"]
|
|
@@ -76,7 +78,7 @@ mcp-http = ["mcp", "api"]
|
|
|
76
78
|
|
|
77
79
|
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
78
80
|
|
|
79
|
-
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
|
|
81
|
+
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
|
|
80
82
|
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
|
81
83
|
|
|
82
84
|
full = [
|
|
@@ -126,7 +128,7 @@ simdutf8 = { version = "0.1", optional = true }
|
|
|
126
128
|
hex = { workspace = true }
|
|
127
129
|
lazy_static = "1.5.0"
|
|
128
130
|
libc = { workspace = true }
|
|
129
|
-
memchr = "2.
|
|
131
|
+
memchr = "2.8.0"
|
|
130
132
|
num_cpus = { workspace = true }
|
|
131
133
|
once_cell = { workspace = true }
|
|
132
134
|
parking_lot = { workspace = true }
|
|
@@ -153,7 +155,9 @@ lopdf = { version = "0.39.0", optional = true }
|
|
|
153
155
|
calamine = { version = "0.33.0", features = ["dates"], optional = true }
|
|
154
156
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
155
157
|
roxmltree = { version = "0.21.1", optional = true }
|
|
156
|
-
zip = { version = "7.4.0", optional = true }
|
|
158
|
+
zip = { version = "7.4.0", optional = true, default-features = false, features = [
|
|
159
|
+
"deflate-flate2",
|
|
160
|
+
] }
|
|
157
161
|
mail-parser = { version = "0.11.1", optional = true }
|
|
158
162
|
msg_parser = { version = "0.1.1", optional = true }
|
|
159
163
|
html-to-markdown-rs = { workspace = true, features = [
|
|
@@ -164,9 +168,16 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
|
|
|
164
168
|
tar = { version = "0.4.44", optional = true }
|
|
165
169
|
sevenz-rust2 = { version = "0.20.1", optional = true }
|
|
166
170
|
lzma-rust2 = { workspace = true, optional = true }
|
|
171
|
+
flate2 = { version = "1.0", optional = true }
|
|
167
172
|
|
|
168
173
|
pulldown-cmark = { version = "0.13", optional = true }
|
|
169
174
|
biblatex = { version = "0.11", optional = true }
|
|
175
|
+
biblib = { version = "0.3", default-features = false, features = [
|
|
176
|
+
"ris",
|
|
177
|
+
"pubmed",
|
|
178
|
+
"xml",
|
|
179
|
+
"regex",
|
|
180
|
+
], optional = true }
|
|
170
181
|
org = { version = "0.3", optional = true }
|
|
171
182
|
rtf-parser = { version = "0.4", optional = true }
|
|
172
183
|
rst_parser = { version = "0.4", optional = true }
|
|
@@ -181,12 +192,18 @@ image = { workspace = true, default-features = false, features = [
|
|
|
181
192
|
"bmp",
|
|
182
193
|
"tiff",
|
|
183
194
|
"gif",
|
|
195
|
+
"pnm",
|
|
184
196
|
"rayon",
|
|
185
197
|
], optional = true }
|
|
186
198
|
tiff = { version = "0.11", optional = true }
|
|
187
199
|
fast_image_resize = { version = "6.0.0", optional = true }
|
|
188
200
|
ndarray = { version = "0.17.2", optional = true }
|
|
189
201
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
202
|
+
hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
203
|
+
"std",
|
|
204
|
+
"simd",
|
|
205
|
+
], optional = true }
|
|
206
|
+
hayro-jbig2 = { version = "0.1", default-features = false, features = ["std"], optional = true }
|
|
190
207
|
whatlang = { version = "0.18.0", optional = true }
|
|
191
208
|
text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
|
|
192
209
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
@@ -216,7 +233,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
|
|
|
216
233
|
tempfile = { workspace = true }
|
|
217
234
|
filetime = "0.2"
|
|
218
235
|
tar = "0.4.44"
|
|
219
|
-
zip = "7.4.0"
|
|
236
|
+
zip = { version = "7.4.0", default-features = false, features = ["deflate-flate2"] }
|
|
220
237
|
serial_test = "3.3.1"
|
|
221
238
|
anyhow = { workspace = true }
|
|
222
239
|
tokio-test = "0.4"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.12 Release**
|
|
20
|
+
> **🚀 Version 4.2.13 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -102,6 +102,15 @@ pub struct ExtractionConfig {
|
|
|
102
102
|
#[serde(default)]
|
|
103
103
|
pub result_format: crate::types::OutputFormat,
|
|
104
104
|
|
|
105
|
+
/// Security limits for archive extraction.
|
|
106
|
+
///
|
|
107
|
+
/// Controls maximum archive size, compression ratio, file count, and other
|
|
108
|
+
/// security thresholds to prevent decompression bomb attacks.
|
|
109
|
+
/// When `None`, default limits are used (500MB archive, 100:1 ratio, 10K files).
|
|
110
|
+
#[cfg(feature = "archives")]
|
|
111
|
+
#[serde(default)]
|
|
112
|
+
pub security_limits: Option<crate::extractors::security::SecurityLimits>,
|
|
113
|
+
|
|
105
114
|
/// Content text format (default: Plain).
|
|
106
115
|
///
|
|
107
116
|
/// Controls the format of the extracted content:
|
|
@@ -137,6 +146,8 @@ impl Default for ExtractionConfig {
|
|
|
137
146
|
#[cfg(feature = "html")]
|
|
138
147
|
html_options: None,
|
|
139
148
|
max_concurrent_extractions: None,
|
|
149
|
+
#[cfg(feature = "archives")]
|
|
150
|
+
security_limits: None,
|
|
140
151
|
result_format: crate::types::OutputFormat::Unified,
|
|
141
152
|
output_format: OutputFormat::Plain,
|
|
142
153
|
}
|
|
@@ -5,16 +5,16 @@
|
|
|
5
5
|
//! - Legacy format conversion (DOC, PPT)
|
|
6
6
|
//! - Extraction pipeline orchestration
|
|
7
7
|
|
|
8
|
-
#[cfg(not(feature = "office"))]
|
|
8
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
9
9
|
use crate::KreuzbergError;
|
|
10
10
|
use crate::Result;
|
|
11
11
|
use crate::core::config::ExtractionConfig;
|
|
12
12
|
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
13
|
-
#[cfg(feature = "office")]
|
|
13
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
14
14
|
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
15
15
|
use crate::types::ExtractionResult;
|
|
16
16
|
|
|
17
|
-
#[cfg(feature = "office")]
|
|
17
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
18
18
|
use super::file::apply_libreoffice_metadata;
|
|
19
19
|
use super::file::extract_bytes_with_extractor;
|
|
20
20
|
#[cfg(feature = "otel")]
|
|
@@ -72,7 +72,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
72
72
|
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
73
73
|
|
|
74
74
|
match validated_mime.as_str() {
|
|
75
|
-
#[cfg(feature = "office")]
|
|
75
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
76
76
|
LEGACY_WORD_MIME_TYPE => {
|
|
77
77
|
let conversion = convert_doc_to_docx(content).await?;
|
|
78
78
|
let mut result =
|
|
@@ -80,13 +80,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
80
80
|
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
81
81
|
return Ok(result);
|
|
82
82
|
}
|
|
83
|
-
#[cfg(not(feature = "office"))]
|
|
83
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
84
84
|
LEGACY_WORD_MIME_TYPE => {
|
|
85
85
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
86
86
|
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
87
87
|
));
|
|
88
88
|
}
|
|
89
|
-
#[cfg(feature = "office")]
|
|
89
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
90
90
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
91
91
|
let conversion = convert_ppt_to_pptx(content).await?;
|
|
92
92
|
let mut result =
|
|
@@ -94,7 +94,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
94
94
|
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
95
95
|
return Ok(result);
|
|
96
96
|
}
|
|
97
|
-
#[cfg(not(feature = "office"))]
|
|
97
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
98
98
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
99
99
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
100
100
|
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
@@ -6,23 +6,23 @@
|
|
|
6
6
|
//! - File validation and reading
|
|
7
7
|
//! - Extraction pipeline orchestration
|
|
8
8
|
|
|
9
|
-
#[cfg(any(feature = "otel", not(feature = "office")))]
|
|
9
|
+
#[cfg(any(feature = "otel", not(all(feature = "office", not(target_arch = "wasm32")))))]
|
|
10
10
|
use crate::KreuzbergError;
|
|
11
11
|
use crate::Result;
|
|
12
12
|
use crate::core::config::ExtractionConfig;
|
|
13
13
|
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
14
|
-
#[cfg(feature = "office")]
|
|
14
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
15
15
|
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
16
16
|
use crate::types::ExtractionResult;
|
|
17
|
-
#[cfg(feature = "office")]
|
|
17
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
18
18
|
use crate::types::LibreOfficeConversionResult;
|
|
19
|
-
#[cfg(feature = "office")]
|
|
19
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
20
20
|
use serde_json::json;
|
|
21
|
-
#[cfg(feature = "office")]
|
|
21
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
22
22
|
use std::borrow::Cow;
|
|
23
23
|
use std::path::Path;
|
|
24
24
|
|
|
25
|
-
#[cfg(feature = "office")]
|
|
25
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
26
26
|
use super::helpers::pool_mime_type;
|
|
27
27
|
|
|
28
28
|
use super::helpers::get_extractor;
|
|
@@ -151,7 +151,7 @@ pub async fn extract_file(
|
|
|
151
151
|
let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
|
|
152
152
|
|
|
153
153
|
match detected_mime.as_str() {
|
|
154
|
-
#[cfg(feature = "office")]
|
|
154
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
155
155
|
LEGACY_WORD_MIME_TYPE => {
|
|
156
156
|
let original_bytes = tokio::fs::read(path).await?;
|
|
157
157
|
let conversion = convert_doc_to_docx(&original_bytes).await?;
|
|
@@ -160,13 +160,13 @@ pub async fn extract_file(
|
|
|
160
160
|
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
161
161
|
return Ok(result);
|
|
162
162
|
}
|
|
163
|
-
#[cfg(not(feature = "office"))]
|
|
163
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
164
164
|
LEGACY_WORD_MIME_TYPE => {
|
|
165
165
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
166
166
|
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
167
167
|
));
|
|
168
168
|
}
|
|
169
|
-
#[cfg(feature = "office")]
|
|
169
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
170
170
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
171
171
|
let original_bytes = tokio::fs::read(path).await?;
|
|
172
172
|
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
@@ -175,7 +175,7 @@ pub async fn extract_file(
|
|
|
175
175
|
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
176
176
|
return Ok(result);
|
|
177
177
|
}
|
|
178
|
-
#[cfg(not(feature = "office"))]
|
|
178
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
179
179
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
180
180
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
181
181
|
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
@@ -222,7 +222,7 @@ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
|
|
|
222
222
|
Ok(result)
|
|
223
223
|
}
|
|
224
224
|
|
|
225
|
-
#[cfg(feature = "office")]
|
|
225
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
226
226
|
pub(in crate::core::extractor) fn apply_libreoffice_metadata(
|
|
227
227
|
result: &mut ExtractionResult,
|
|
228
228
|
legacy_mime: &str,
|
|
@@ -80,6 +80,10 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
80
80
|
m.insert("jpx", "image/jpx");
|
|
81
81
|
m.insert("jpm", "image/jpm");
|
|
82
82
|
m.insert("mj2", "image/mj2");
|
|
83
|
+
m.insert("j2k", "image/jp2");
|
|
84
|
+
m.insert("j2c", "image/jp2");
|
|
85
|
+
m.insert("jbig2", "image/x-jbig2");
|
|
86
|
+
m.insert("jb2", "image/x-jbig2");
|
|
83
87
|
m.insert("pnm", "image/x-portable-anymap");
|
|
84
88
|
m.insert("pbm", "image/x-portable-bitmap");
|
|
85
89
|
m.insert("pgm", "image/x-portable-graymap");
|
|
@@ -108,10 +112,18 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
108
112
|
m.insert("epub", "application/epub+zip");
|
|
109
113
|
m.insert("rtf", "application/rtf");
|
|
110
114
|
m.insert("bib", "application/x-bibtex");
|
|
115
|
+
m.insert("ris", "application/x-research-info-systems");
|
|
116
|
+
m.insert("nbib", "application/x-pubmed");
|
|
117
|
+
m.insert("enw", "application/x-endnote+xml");
|
|
118
|
+
m.insert("fb2", "application/x-fictionbook+xml");
|
|
119
|
+
m.insert("opml", "application/xml+opml");
|
|
120
|
+
m.insert("dbk", "application/docbook+xml");
|
|
111
121
|
m.insert("ipynb", "application/x-ipynb+json");
|
|
112
122
|
m.insert("tex", "application/x-latex");
|
|
113
123
|
m.insert("latex", "application/x-latex");
|
|
114
124
|
m.insert("typst", "application/x-typst");
|
|
125
|
+
m.insert("typ", "application/x-typst");
|
|
126
|
+
m.insert("djot", "text/x-djot");
|
|
115
127
|
m.insert("commonmark", "text/x-commonmark");
|
|
116
128
|
|
|
117
129
|
m
|
|
@@ -137,6 +149,7 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
137
149
|
set.insert("image/tiff");
|
|
138
150
|
set.insert("image/webp");
|
|
139
151
|
set.insert("image/x-bmp");
|
|
152
|
+
set.insert("image/x-jbig2");
|
|
140
153
|
set.insert("image/x-ms-bmp");
|
|
141
154
|
set.insert("image/x-portable-anymap");
|
|
142
155
|
set.insert("image/x-portable-bitmap");
|
|
@@ -146,20 +159,25 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
146
159
|
|
|
147
160
|
set.insert("application/csl+json");
|
|
148
161
|
set.insert("application/docbook+xml");
|
|
162
|
+
set.insert("text/docbook");
|
|
149
163
|
set.insert("application/epub+zip");
|
|
150
164
|
set.insert("application/rtf");
|
|
151
165
|
set.insert("application/vnd.oasis.opendocument.text");
|
|
152
166
|
set.insert(DOCX_MIME_TYPE);
|
|
153
167
|
set.insert("application/x-biblatex");
|
|
154
168
|
set.insert("application/x-bibtex");
|
|
169
|
+
set.insert("text/x-bibtex");
|
|
155
170
|
set.insert("application/x-endnote+xml");
|
|
156
171
|
set.insert("application/x-fictionbook+xml");
|
|
172
|
+
set.insert("application/x-fictionbook");
|
|
173
|
+
set.insert("text/x-fictionbook");
|
|
157
174
|
set.insert("application/x-ipynb+json");
|
|
158
175
|
set.insert("application/x-jats+xml");
|
|
159
176
|
set.insert("application/x-latex");
|
|
160
177
|
set.insert("application/xml+opml");
|
|
161
178
|
set.insert("application/x-opml+xml");
|
|
162
179
|
set.insert("application/x-research-info-systems");
|
|
180
|
+
set.insert("application/x-pubmed");
|
|
163
181
|
set.insert("application/x-typst");
|
|
164
182
|
set.insert("text/csv");
|
|
165
183
|
set.insert("text/tab-separated-values");
|
|
@@ -210,8 +228,26 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
210
228
|
set.insert("application/tar");
|
|
211
229
|
set.insert("application/x-gtar");
|
|
212
230
|
set.insert("application/x-ustar");
|
|
231
|
+
set.insert("application/gzip");
|
|
232
|
+
set.insert("application/x-gzip");
|
|
213
233
|
set.insert("application/x-7z-compressed");
|
|
214
234
|
|
|
235
|
+
set.insert("text/djot");
|
|
236
|
+
set.insert("text/x-djot");
|
|
237
|
+
|
|
238
|
+
// Additional extractor-supported MIME types that must stay in sync
|
|
239
|
+
set.insert("text/jats");
|
|
240
|
+
set.insert("application/x-epub+zip");
|
|
241
|
+
set.insert("application/vnd.epub+zip");
|
|
242
|
+
set.insert("text/rtf");
|
|
243
|
+
set.insert("text/prs.fallenstein.rst");
|
|
244
|
+
set.insert("text/x-tex");
|
|
245
|
+
set.insert("text/org");
|
|
246
|
+
set.insert("application/x-org");
|
|
247
|
+
set.insert("application/xhtml+xml");
|
|
248
|
+
set.insert("text/x-typst");
|
|
249
|
+
set.insert("image/jpg");
|
|
250
|
+
|
|
215
251
|
set
|
|
216
252
|
});
|
|
217
253
|
|
|
@@ -291,6 +327,15 @@ pub fn validate_mime_type(mime_type: &str) -> Result<String> {
|
|
|
291
327
|
return Ok(mime_type.to_string());
|
|
292
328
|
}
|
|
293
329
|
|
|
330
|
+
// Case-insensitive fallback: MIME types are case-insensitive per RFC 2045.
|
|
331
|
+
// This handles common mismatches like "macroEnabled" vs "macroenabled".
|
|
332
|
+
let lower = mime_type.to_ascii_lowercase();
|
|
333
|
+
for supported in SUPPORTED_MIME_TYPES.iter() {
|
|
334
|
+
if supported.to_ascii_lowercase() == lower {
|
|
335
|
+
return Ok(supported.to_string());
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
294
339
|
Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
|
|
295
340
|
}
|
|
296
341
|
|
|
@@ -621,8 +666,8 @@ mod tests {
|
|
|
621
666
|
let file_path = dir.path().join("testfile");
|
|
622
667
|
File::create(&file_path).unwrap();
|
|
623
668
|
|
|
624
|
-
let
|
|
625
|
-
|
|
669
|
+
let _result = detect_mime_type(&file_path, true);
|
|
670
|
+
// Files without extensions may or may not be detected via mime_guess fallback
|
|
626
671
|
}
|
|
627
672
|
|
|
628
673
|
#[test]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
//! Gzip decompression and extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functions for decompressing gzip files and extracting
|
|
4
|
+
//! metadata and text content from the compressed data.
|
|
5
|
+
|
|
6
|
+
use super::{ArchiveEntry, ArchiveMetadata};
|
|
7
|
+
use crate::error::{KreuzbergError, Result};
|
|
8
|
+
use crate::extractors::security::SecurityLimits;
|
|
9
|
+
use flate2::read::GzDecoder;
|
|
10
|
+
use std::collections::HashMap;
|
|
11
|
+
use std::io::Read;
|
|
12
|
+
|
|
13
|
+
/// Decompress gzip bytes with a size limit to prevent decompression bombs.
|
|
14
|
+
fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
|
|
15
|
+
let decoder = GzDecoder::new(bytes);
|
|
16
|
+
let mut limited = decoder.take(max_size + 1);
|
|
17
|
+
let mut decompressed = Vec::new();
|
|
18
|
+
limited
|
|
19
|
+
.read_to_end(&mut decompressed)
|
|
20
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to decompress gzip: {}", e)))?;
|
|
21
|
+
|
|
22
|
+
if decompressed.len() as u64 > max_size {
|
|
23
|
+
return Err(KreuzbergError::validation(format!(
|
|
24
|
+
"Gzip decompressed size exceeds {} byte limit",
|
|
25
|
+
max_size
|
|
26
|
+
)));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
Ok(decompressed)
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/// Decompress gzip bytes, returning the raw decompressed data.
|
|
33
|
+
pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>> {
|
|
34
|
+
decompress_gzip_limited(bytes, limits.max_archive_size as u64)
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Extract both metadata and text content from gzip in a single decompression pass.
|
|
38
|
+
///
|
|
39
|
+
/// This avoids the overhead of decompressing the data multiple times when both
|
|
40
|
+
/// metadata and text content are needed.
|
|
41
|
+
pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
|
|
42
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
43
|
+
|
|
44
|
+
// Re-read header for filename (lightweight - no decompression)
|
|
45
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
46
|
+
let mut _discard = [0u8; 1];
|
|
47
|
+
let _ = decoder.read(&mut _discard); // trigger header read
|
|
48
|
+
let filename = decoder
|
|
49
|
+
.header()
|
|
50
|
+
.and_then(|h| h.filename())
|
|
51
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
52
|
+
.unwrap_or("compressed_content")
|
|
53
|
+
.to_string();
|
|
54
|
+
|
|
55
|
+
let size = decompressed.len() as u64;
|
|
56
|
+
|
|
57
|
+
let metadata = ArchiveMetadata {
|
|
58
|
+
format: "GZIP".to_string(),
|
|
59
|
+
file_list: vec![ArchiveEntry {
|
|
60
|
+
path: filename.clone(),
|
|
61
|
+
size,
|
|
62
|
+
is_dir: false,
|
|
63
|
+
}],
|
|
64
|
+
file_count: 1,
|
|
65
|
+
total_size: size,
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
let mut contents = HashMap::new();
|
|
69
|
+
if let Ok(text) = String::from_utf8(decompressed) {
|
|
70
|
+
contents.insert(filename, text);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
Ok((metadata, contents))
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Extract metadata from a gzip-compressed file.
|
|
77
|
+
///
|
|
78
|
+
/// Gzip wraps a single stream, so the metadata contains one entry
|
|
79
|
+
/// with the original filename (from gzip header) and decompressed size.
|
|
80
|
+
pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
81
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
82
|
+
|
|
83
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
84
|
+
let mut _discard = [0u8; 1];
|
|
85
|
+
let _ = decoder.read(&mut _discard);
|
|
86
|
+
let filename = decoder
|
|
87
|
+
.header()
|
|
88
|
+
.and_then(|h| h.filename())
|
|
89
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
90
|
+
.unwrap_or("compressed_content")
|
|
91
|
+
.to_string();
|
|
92
|
+
|
|
93
|
+
let size = decompressed.len() as u64;
|
|
94
|
+
|
|
95
|
+
Ok(ArchiveMetadata {
|
|
96
|
+
format: "GZIP".to_string(),
|
|
97
|
+
file_list: vec![ArchiveEntry {
|
|
98
|
+
path: filename,
|
|
99
|
+
size,
|
|
100
|
+
is_dir: false,
|
|
101
|
+
}],
|
|
102
|
+
file_count: 1,
|
|
103
|
+
total_size: size,
|
|
104
|
+
})
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
/// Extract text content from a gzip-compressed file.
|
|
108
|
+
///
|
|
109
|
+
/// Decompresses and attempts to read the result as UTF-8 text.
|
|
110
|
+
pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
111
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
112
|
+
|
|
113
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
114
|
+
let mut _discard = [0u8; 1];
|
|
115
|
+
let _ = decoder.read(&mut _discard);
|
|
116
|
+
let filename = decoder
|
|
117
|
+
.header()
|
|
118
|
+
.and_then(|h| h.filename())
|
|
119
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
120
|
+
.unwrap_or("compressed_content")
|
|
121
|
+
.to_string();
|
|
122
|
+
|
|
123
|
+
let mut contents = HashMap::new();
|
|
124
|
+
if let Ok(text) = String::from_utf8(decompressed) {
|
|
125
|
+
contents.insert(filename, text);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
Ok(contents)
|
|
129
|
+
}
|