kreuzberg 4.2.11 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -9
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +7 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3c2053b10256948a215ff0d3552894991e801497ac4b2480eca3c98bb645cc27
|
|
4
|
+
data.tar.gz: 324b6147e172ecedb2338fab1b14ce2022a8b9c2d6be7fd86ac0f862d81ef7ce
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 84a6636111d240c99eb17546f80c1df31117c700d78282c18a67a79aa613021d33988cbc1b00d5bc62bb2ffeef8c2a8f1759e137329de8f30af7f61b6db1a55b
|
|
7
|
+
data.tar.gz: 7628ecce3c6fb44c06a9546f2db696ae3486de35e0a05195cbea752bc6f78e573162e6305aec8c8ae0ca0fdbb6709a3e75752b822dbd8aed637eff9577c3e020
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.
|
|
4
|
+
kreuzberg (4.2.13)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -123,7 +123,7 @@ GEM
|
|
|
123
123
|
rubocop (~> 1.81)
|
|
124
124
|
ruby-progressbar (1.13.0)
|
|
125
125
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
126
|
+
sorbet-runtime (0.6.12925)
|
|
127
127
|
steep (1.10.0)
|
|
128
128
|
activesupport (>= 5.1)
|
|
129
129
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.
|
|
212
|
+
kreuzberg (4.2.13)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -244,7 +244,7 @@ CHECKSUMS
|
|
|
244
244
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
245
245
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
246
246
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
247
|
-
sorbet-runtime (0.6.
|
|
247
|
+
sorbet-runtime (0.6.12925) sha256=ddd6fb1d8aaf6bc19119ffadbc4b96536f3d6766fa82059112dacb90977c6eca
|
|
248
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
249
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
250
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.2.
|
|
6
|
+
version = "4.2.13"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -49,7 +49,7 @@ toml = "0.9.11"
|
|
|
49
49
|
num_cpus = "1.17.0"
|
|
50
50
|
once_cell = "1.21.3"
|
|
51
51
|
html-to-markdown-rs = { version = "2.24.5", default-features = false }
|
|
52
|
-
reqwest = { version = "0.13.
|
|
52
|
+
reqwest = { version = "0.13.2", default-features = false, features = ["json", "rustls"] }
|
|
53
53
|
image = { version = "0.25.9", default-features = false }
|
|
54
54
|
lzma-rust2 = { version = "0.15.7" }
|
|
55
55
|
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.
|
|
3
|
+
version = "4.2.13"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -35,22 +35,21 @@ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
|
|
|
35
35
|
office = [
|
|
36
36
|
"dep:roxmltree",
|
|
37
37
|
"dep:zip",
|
|
38
|
-
"dep:docx-lite",
|
|
39
38
|
"dep:quick-xml",
|
|
40
39
|
"dep:pulldown-cmark",
|
|
41
40
|
"dep:biblatex",
|
|
41
|
+
"dep:biblib",
|
|
42
42
|
"dep:org",
|
|
43
43
|
"dep:rtf-parser",
|
|
44
44
|
"dep:rst_parser",
|
|
45
45
|
"dep:fb2",
|
|
46
46
|
"dep:typst-syntax",
|
|
47
47
|
"html",
|
|
48
|
-
"tokio-runtime",
|
|
49
48
|
]
|
|
50
49
|
email = ["dep:mail-parser", "dep:msg_parser"]
|
|
51
50
|
html = ["dep:html-to-markdown-rs"]
|
|
52
51
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
53
|
-
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2"]
|
|
52
|
+
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
|
|
54
53
|
|
|
55
54
|
ocr = [
|
|
56
55
|
"dep:kreuzberg-tesseract",
|
|
@@ -59,6 +58,8 @@ ocr = [
|
|
|
59
58
|
"dep:fast_image_resize",
|
|
60
59
|
"dep:ndarray",
|
|
61
60
|
"dep:kamadak-exif",
|
|
61
|
+
"dep:hayro-jpeg2000",
|
|
62
|
+
"dep:hayro-jbig2",
|
|
62
63
|
"html",
|
|
63
64
|
]
|
|
64
65
|
language-detection = ["dep:whatlang"]
|
|
@@ -77,7 +78,7 @@ mcp-http = ["mcp", "api"]
|
|
|
77
78
|
|
|
78
79
|
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
79
80
|
|
|
80
|
-
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
|
|
81
|
+
wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
|
|
81
82
|
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
|
82
83
|
|
|
83
84
|
full = [
|
|
@@ -127,7 +128,7 @@ simdutf8 = { version = "0.1", optional = true }
|
|
|
127
128
|
hex = { workspace = true }
|
|
128
129
|
lazy_static = "1.5.0"
|
|
129
130
|
libc = { workspace = true }
|
|
130
|
-
memchr = "2.
|
|
131
|
+
memchr = "2.8.0"
|
|
131
132
|
num_cpus = { workspace = true }
|
|
132
133
|
once_cell = { workspace = true }
|
|
133
134
|
parking_lot = { workspace = true }
|
|
@@ -154,7 +155,9 @@ lopdf = { version = "0.39.0", optional = true }
|
|
|
154
155
|
calamine = { version = "0.33.0", features = ["dates"], optional = true }
|
|
155
156
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
156
157
|
roxmltree = { version = "0.21.1", optional = true }
|
|
157
|
-
zip = { version = "7.
|
|
158
|
+
zip = { version = "7.4.0", optional = true, default-features = false, features = [
|
|
159
|
+
"deflate-flate2",
|
|
160
|
+
] }
|
|
158
161
|
mail-parser = { version = "0.11.1", optional = true }
|
|
159
162
|
msg_parser = { version = "0.1.1", optional = true }
|
|
160
163
|
html-to-markdown-rs = { workspace = true, features = [
|
|
@@ -165,10 +168,16 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
|
|
|
165
168
|
tar = { version = "0.4.44", optional = true }
|
|
166
169
|
sevenz-rust2 = { version = "0.20.1", optional = true }
|
|
167
170
|
lzma-rust2 = { workspace = true, optional = true }
|
|
168
|
-
|
|
171
|
+
flate2 = { version = "1.0", optional = true }
|
|
169
172
|
|
|
170
173
|
pulldown-cmark = { version = "0.13", optional = true }
|
|
171
174
|
biblatex = { version = "0.11", optional = true }
|
|
175
|
+
biblib = { version = "0.3", default-features = false, features = [
|
|
176
|
+
"ris",
|
|
177
|
+
"pubmed",
|
|
178
|
+
"xml",
|
|
179
|
+
"regex",
|
|
180
|
+
], optional = true }
|
|
172
181
|
org = { version = "0.3", optional = true }
|
|
173
182
|
rtf-parser = { version = "0.4", optional = true }
|
|
174
183
|
rst_parser = { version = "0.4", optional = true }
|
|
@@ -183,12 +192,18 @@ image = { workspace = true, default-features = false, features = [
|
|
|
183
192
|
"bmp",
|
|
184
193
|
"tiff",
|
|
185
194
|
"gif",
|
|
195
|
+
"pnm",
|
|
186
196
|
"rayon",
|
|
187
197
|
], optional = true }
|
|
188
198
|
tiff = { version = "0.11", optional = true }
|
|
189
199
|
fast_image_resize = { version = "6.0.0", optional = true }
|
|
190
200
|
ndarray = { version = "0.17.2", optional = true }
|
|
191
201
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
202
|
+
hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
203
|
+
"std",
|
|
204
|
+
"simd",
|
|
205
|
+
], optional = true }
|
|
206
|
+
hayro-jbig2 = { version = "0.1", default-features = false, features = ["std"], optional = true }
|
|
192
207
|
whatlang = { version = "0.18.0", optional = true }
|
|
193
208
|
text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
|
|
194
209
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
@@ -218,7 +233,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
|
|
|
218
233
|
tempfile = { workspace = true }
|
|
219
234
|
filetime = "0.2"
|
|
220
235
|
tar = "0.4.44"
|
|
221
|
-
zip = "7.
|
|
236
|
+
zip = { version = "7.4.0", default-features = false, features = ["deflate-flate2"] }
|
|
222
237
|
serial_test = "3.3.1"
|
|
223
238
|
anyhow = { workspace = true }
|
|
224
239
|
tokio-test = "0.4"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.
|
|
20
|
+
> **🚀 Version 4.2.13 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -102,6 +102,15 @@ pub struct ExtractionConfig {
|
|
|
102
102
|
#[serde(default)]
|
|
103
103
|
pub result_format: crate::types::OutputFormat,
|
|
104
104
|
|
|
105
|
+
/// Security limits for archive extraction.
|
|
106
|
+
///
|
|
107
|
+
/// Controls maximum archive size, compression ratio, file count, and other
|
|
108
|
+
/// security thresholds to prevent decompression bomb attacks.
|
|
109
|
+
/// When `None`, default limits are used (500MB archive, 100:1 ratio, 10K files).
|
|
110
|
+
#[cfg(feature = "archives")]
|
|
111
|
+
#[serde(default)]
|
|
112
|
+
pub security_limits: Option<crate::extractors::security::SecurityLimits>,
|
|
113
|
+
|
|
105
114
|
/// Content text format (default: Plain).
|
|
106
115
|
///
|
|
107
116
|
/// Controls the format of the extracted content:
|
|
@@ -137,6 +146,8 @@ impl Default for ExtractionConfig {
|
|
|
137
146
|
#[cfg(feature = "html")]
|
|
138
147
|
html_options: None,
|
|
139
148
|
max_concurrent_extractions: None,
|
|
149
|
+
#[cfg(feature = "archives")]
|
|
150
|
+
security_limits: None,
|
|
140
151
|
result_format: crate::types::OutputFormat::Unified,
|
|
141
152
|
output_format: OutputFormat::Plain,
|
|
142
153
|
}
|
|
@@ -5,16 +5,16 @@
|
|
|
5
5
|
//! - Legacy format conversion (DOC, PPT)
|
|
6
6
|
//! - Extraction pipeline orchestration
|
|
7
7
|
|
|
8
|
-
#[cfg(not(feature = "office"))]
|
|
8
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
9
9
|
use crate::KreuzbergError;
|
|
10
10
|
use crate::Result;
|
|
11
11
|
use crate::core::config::ExtractionConfig;
|
|
12
12
|
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
13
|
-
#[cfg(feature = "office")]
|
|
13
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
14
14
|
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
15
15
|
use crate::types::ExtractionResult;
|
|
16
16
|
|
|
17
|
-
#[cfg(feature = "office")]
|
|
17
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
18
18
|
use super::file::apply_libreoffice_metadata;
|
|
19
19
|
use super::file::extract_bytes_with_extractor;
|
|
20
20
|
#[cfg(feature = "otel")]
|
|
@@ -72,7 +72,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
72
72
|
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
73
73
|
|
|
74
74
|
match validated_mime.as_str() {
|
|
75
|
-
#[cfg(feature = "office")]
|
|
75
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
76
76
|
LEGACY_WORD_MIME_TYPE => {
|
|
77
77
|
let conversion = convert_doc_to_docx(content).await?;
|
|
78
78
|
let mut result =
|
|
@@ -80,13 +80,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
80
80
|
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
81
81
|
return Ok(result);
|
|
82
82
|
}
|
|
83
|
-
#[cfg(not(feature = "office"))]
|
|
83
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
84
84
|
LEGACY_WORD_MIME_TYPE => {
|
|
85
85
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
86
86
|
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
87
87
|
));
|
|
88
88
|
}
|
|
89
|
-
#[cfg(feature = "office")]
|
|
89
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
90
90
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
91
91
|
let conversion = convert_ppt_to_pptx(content).await?;
|
|
92
92
|
let mut result =
|
|
@@ -94,7 +94,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
94
94
|
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
95
95
|
return Ok(result);
|
|
96
96
|
}
|
|
97
|
-
#[cfg(not(feature = "office"))]
|
|
97
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
98
98
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
99
99
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
100
100
|
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
@@ -6,23 +6,23 @@
|
|
|
6
6
|
//! - File validation and reading
|
|
7
7
|
//! - Extraction pipeline orchestration
|
|
8
8
|
|
|
9
|
-
#[cfg(any(feature = "otel", not(feature = "office")))]
|
|
9
|
+
#[cfg(any(feature = "otel", not(all(feature = "office", not(target_arch = "wasm32")))))]
|
|
10
10
|
use crate::KreuzbergError;
|
|
11
11
|
use crate::Result;
|
|
12
12
|
use crate::core::config::ExtractionConfig;
|
|
13
13
|
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
14
|
-
#[cfg(feature = "office")]
|
|
14
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
15
15
|
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
16
16
|
use crate::types::ExtractionResult;
|
|
17
|
-
#[cfg(feature = "office")]
|
|
17
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
18
18
|
use crate::types::LibreOfficeConversionResult;
|
|
19
|
-
#[cfg(feature = "office")]
|
|
19
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
20
20
|
use serde_json::json;
|
|
21
|
-
#[cfg(feature = "office")]
|
|
21
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
22
22
|
use std::borrow::Cow;
|
|
23
23
|
use std::path::Path;
|
|
24
24
|
|
|
25
|
-
#[cfg(feature = "office")]
|
|
25
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
26
26
|
use super::helpers::pool_mime_type;
|
|
27
27
|
|
|
28
28
|
use super::helpers::get_extractor;
|
|
@@ -151,7 +151,7 @@ pub async fn extract_file(
|
|
|
151
151
|
let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
|
|
152
152
|
|
|
153
153
|
match detected_mime.as_str() {
|
|
154
|
-
#[cfg(feature = "office")]
|
|
154
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
155
155
|
LEGACY_WORD_MIME_TYPE => {
|
|
156
156
|
let original_bytes = tokio::fs::read(path).await?;
|
|
157
157
|
let conversion = convert_doc_to_docx(&original_bytes).await?;
|
|
@@ -160,13 +160,13 @@ pub async fn extract_file(
|
|
|
160
160
|
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
161
161
|
return Ok(result);
|
|
162
162
|
}
|
|
163
|
-
#[cfg(not(feature = "office"))]
|
|
163
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
164
164
|
LEGACY_WORD_MIME_TYPE => {
|
|
165
165
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
166
166
|
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
167
167
|
));
|
|
168
168
|
}
|
|
169
|
-
#[cfg(feature = "office")]
|
|
169
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
170
170
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
171
171
|
let original_bytes = tokio::fs::read(path).await?;
|
|
172
172
|
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
@@ -175,7 +175,7 @@ pub async fn extract_file(
|
|
|
175
175
|
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
176
176
|
return Ok(result);
|
|
177
177
|
}
|
|
178
|
-
#[cfg(not(feature = "office"))]
|
|
178
|
+
#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
|
|
179
179
|
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
180
180
|
return Err(KreuzbergError::UnsupportedFormat(
|
|
181
181
|
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
@@ -222,7 +222,7 @@ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
|
|
|
222
222
|
Ok(result)
|
|
223
223
|
}
|
|
224
224
|
|
|
225
|
-
#[cfg(feature = "office")]
|
|
225
|
+
#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
|
|
226
226
|
pub(in crate::core::extractor) fn apply_libreoffice_metadata(
|
|
227
227
|
result: &mut ExtractionResult,
|
|
228
228
|
legacy_mime: &str,
|
|
@@ -80,6 +80,10 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
80
80
|
m.insert("jpx", "image/jpx");
|
|
81
81
|
m.insert("jpm", "image/jpm");
|
|
82
82
|
m.insert("mj2", "image/mj2");
|
|
83
|
+
m.insert("j2k", "image/jp2");
|
|
84
|
+
m.insert("j2c", "image/jp2");
|
|
85
|
+
m.insert("jbig2", "image/x-jbig2");
|
|
86
|
+
m.insert("jb2", "image/x-jbig2");
|
|
83
87
|
m.insert("pnm", "image/x-portable-anymap");
|
|
84
88
|
m.insert("pbm", "image/x-portable-bitmap");
|
|
85
89
|
m.insert("pgm", "image/x-portable-graymap");
|
|
@@ -108,10 +112,18 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
|
|
|
108
112
|
m.insert("epub", "application/epub+zip");
|
|
109
113
|
m.insert("rtf", "application/rtf");
|
|
110
114
|
m.insert("bib", "application/x-bibtex");
|
|
115
|
+
m.insert("ris", "application/x-research-info-systems");
|
|
116
|
+
m.insert("nbib", "application/x-pubmed");
|
|
117
|
+
m.insert("enw", "application/x-endnote+xml");
|
|
118
|
+
m.insert("fb2", "application/x-fictionbook+xml");
|
|
119
|
+
m.insert("opml", "application/xml+opml");
|
|
120
|
+
m.insert("dbk", "application/docbook+xml");
|
|
111
121
|
m.insert("ipynb", "application/x-ipynb+json");
|
|
112
122
|
m.insert("tex", "application/x-latex");
|
|
113
123
|
m.insert("latex", "application/x-latex");
|
|
114
124
|
m.insert("typst", "application/x-typst");
|
|
125
|
+
m.insert("typ", "application/x-typst");
|
|
126
|
+
m.insert("djot", "text/x-djot");
|
|
115
127
|
m.insert("commonmark", "text/x-commonmark");
|
|
116
128
|
|
|
117
129
|
m
|
|
@@ -137,6 +149,7 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
137
149
|
set.insert("image/tiff");
|
|
138
150
|
set.insert("image/webp");
|
|
139
151
|
set.insert("image/x-bmp");
|
|
152
|
+
set.insert("image/x-jbig2");
|
|
140
153
|
set.insert("image/x-ms-bmp");
|
|
141
154
|
set.insert("image/x-portable-anymap");
|
|
142
155
|
set.insert("image/x-portable-bitmap");
|
|
@@ -146,20 +159,25 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
146
159
|
|
|
147
160
|
set.insert("application/csl+json");
|
|
148
161
|
set.insert("application/docbook+xml");
|
|
162
|
+
set.insert("text/docbook");
|
|
149
163
|
set.insert("application/epub+zip");
|
|
150
164
|
set.insert("application/rtf");
|
|
151
165
|
set.insert("application/vnd.oasis.opendocument.text");
|
|
152
166
|
set.insert(DOCX_MIME_TYPE);
|
|
153
167
|
set.insert("application/x-biblatex");
|
|
154
168
|
set.insert("application/x-bibtex");
|
|
169
|
+
set.insert("text/x-bibtex");
|
|
155
170
|
set.insert("application/x-endnote+xml");
|
|
156
171
|
set.insert("application/x-fictionbook+xml");
|
|
172
|
+
set.insert("application/x-fictionbook");
|
|
173
|
+
set.insert("text/x-fictionbook");
|
|
157
174
|
set.insert("application/x-ipynb+json");
|
|
158
175
|
set.insert("application/x-jats+xml");
|
|
159
176
|
set.insert("application/x-latex");
|
|
160
177
|
set.insert("application/xml+opml");
|
|
161
178
|
set.insert("application/x-opml+xml");
|
|
162
179
|
set.insert("application/x-research-info-systems");
|
|
180
|
+
set.insert("application/x-pubmed");
|
|
163
181
|
set.insert("application/x-typst");
|
|
164
182
|
set.insert("text/csv");
|
|
165
183
|
set.insert("text/tab-separated-values");
|
|
@@ -210,8 +228,26 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
210
228
|
set.insert("application/tar");
|
|
211
229
|
set.insert("application/x-gtar");
|
|
212
230
|
set.insert("application/x-ustar");
|
|
231
|
+
set.insert("application/gzip");
|
|
232
|
+
set.insert("application/x-gzip");
|
|
213
233
|
set.insert("application/x-7z-compressed");
|
|
214
234
|
|
|
235
|
+
set.insert("text/djot");
|
|
236
|
+
set.insert("text/x-djot");
|
|
237
|
+
|
|
238
|
+
// Additional extractor-supported MIME types that must stay in sync
|
|
239
|
+
set.insert("text/jats");
|
|
240
|
+
set.insert("application/x-epub+zip");
|
|
241
|
+
set.insert("application/vnd.epub+zip");
|
|
242
|
+
set.insert("text/rtf");
|
|
243
|
+
set.insert("text/prs.fallenstein.rst");
|
|
244
|
+
set.insert("text/x-tex");
|
|
245
|
+
set.insert("text/org");
|
|
246
|
+
set.insert("application/x-org");
|
|
247
|
+
set.insert("application/xhtml+xml");
|
|
248
|
+
set.insert("text/x-typst");
|
|
249
|
+
set.insert("image/jpg");
|
|
250
|
+
|
|
215
251
|
set
|
|
216
252
|
});
|
|
217
253
|
|
|
@@ -291,6 +327,15 @@ pub fn validate_mime_type(mime_type: &str) -> Result<String> {
|
|
|
291
327
|
return Ok(mime_type.to_string());
|
|
292
328
|
}
|
|
293
329
|
|
|
330
|
+
// Case-insensitive fallback: MIME types are case-insensitive per RFC 2045.
|
|
331
|
+
// This handles common mismatches like "macroEnabled" vs "macroenabled".
|
|
332
|
+
let lower = mime_type.to_ascii_lowercase();
|
|
333
|
+
for supported in SUPPORTED_MIME_TYPES.iter() {
|
|
334
|
+
if supported.to_ascii_lowercase() == lower {
|
|
335
|
+
return Ok(supported.to_string());
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
294
339
|
Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
|
|
295
340
|
}
|
|
296
341
|
|
|
@@ -621,8 +666,8 @@ mod tests {
|
|
|
621
666
|
let file_path = dir.path().join("testfile");
|
|
622
667
|
File::create(&file_path).unwrap();
|
|
623
668
|
|
|
624
|
-
let
|
|
625
|
-
|
|
669
|
+
let _result = detect_mime_type(&file_path, true);
|
|
670
|
+
// Files without extensions may or may not be detected via mime_guess fallback
|
|
626
671
|
}
|
|
627
672
|
|
|
628
673
|
#[test]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
//! Gzip decompression and extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functions for decompressing gzip files and extracting
|
|
4
|
+
//! metadata and text content from the compressed data.
|
|
5
|
+
|
|
6
|
+
use super::{ArchiveEntry, ArchiveMetadata};
|
|
7
|
+
use crate::error::{KreuzbergError, Result};
|
|
8
|
+
use crate::extractors::security::SecurityLimits;
|
|
9
|
+
use flate2::read::GzDecoder;
|
|
10
|
+
use std::collections::HashMap;
|
|
11
|
+
use std::io::Read;
|
|
12
|
+
|
|
13
|
+
/// Decompress gzip bytes with a size limit to prevent decompression bombs.
|
|
14
|
+
fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
|
|
15
|
+
let decoder = GzDecoder::new(bytes);
|
|
16
|
+
let mut limited = decoder.take(max_size + 1);
|
|
17
|
+
let mut decompressed = Vec::new();
|
|
18
|
+
limited
|
|
19
|
+
.read_to_end(&mut decompressed)
|
|
20
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to decompress gzip: {}", e)))?;
|
|
21
|
+
|
|
22
|
+
if decompressed.len() as u64 > max_size {
|
|
23
|
+
return Err(KreuzbergError::validation(format!(
|
|
24
|
+
"Gzip decompressed size exceeds {} byte limit",
|
|
25
|
+
max_size
|
|
26
|
+
)));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
Ok(decompressed)
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/// Decompress gzip bytes, returning the raw decompressed data.
|
|
33
|
+
pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>> {
|
|
34
|
+
decompress_gzip_limited(bytes, limits.max_archive_size as u64)
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Extract both metadata and text content from gzip in a single decompression pass.
|
|
38
|
+
///
|
|
39
|
+
/// This avoids the overhead of decompressing the data multiple times when both
|
|
40
|
+
/// metadata and text content are needed.
|
|
41
|
+
pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
|
|
42
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
43
|
+
|
|
44
|
+
// Re-read header for filename (lightweight - no decompression)
|
|
45
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
46
|
+
let mut _discard = [0u8; 1];
|
|
47
|
+
let _ = decoder.read(&mut _discard); // trigger header read
|
|
48
|
+
let filename = decoder
|
|
49
|
+
.header()
|
|
50
|
+
.and_then(|h| h.filename())
|
|
51
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
52
|
+
.unwrap_or("compressed_content")
|
|
53
|
+
.to_string();
|
|
54
|
+
|
|
55
|
+
let size = decompressed.len() as u64;
|
|
56
|
+
|
|
57
|
+
let metadata = ArchiveMetadata {
|
|
58
|
+
format: "GZIP".to_string(),
|
|
59
|
+
file_list: vec![ArchiveEntry {
|
|
60
|
+
path: filename.clone(),
|
|
61
|
+
size,
|
|
62
|
+
is_dir: false,
|
|
63
|
+
}],
|
|
64
|
+
file_count: 1,
|
|
65
|
+
total_size: size,
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
let mut contents = HashMap::new();
|
|
69
|
+
if let Ok(text) = String::from_utf8(decompressed) {
|
|
70
|
+
contents.insert(filename, text);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
Ok((metadata, contents))
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Extract metadata from a gzip-compressed file.
|
|
77
|
+
///
|
|
78
|
+
/// Gzip wraps a single stream, so the metadata contains one entry
|
|
79
|
+
/// with the original filename (from gzip header) and decompressed size.
|
|
80
|
+
pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
|
|
81
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
82
|
+
|
|
83
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
84
|
+
let mut _discard = [0u8; 1];
|
|
85
|
+
let _ = decoder.read(&mut _discard);
|
|
86
|
+
let filename = decoder
|
|
87
|
+
.header()
|
|
88
|
+
.and_then(|h| h.filename())
|
|
89
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
90
|
+
.unwrap_or("compressed_content")
|
|
91
|
+
.to_string();
|
|
92
|
+
|
|
93
|
+
let size = decompressed.len() as u64;
|
|
94
|
+
|
|
95
|
+
Ok(ArchiveMetadata {
|
|
96
|
+
format: "GZIP".to_string(),
|
|
97
|
+
file_list: vec![ArchiveEntry {
|
|
98
|
+
path: filename,
|
|
99
|
+
size,
|
|
100
|
+
is_dir: false,
|
|
101
|
+
}],
|
|
102
|
+
file_count: 1,
|
|
103
|
+
total_size: size,
|
|
104
|
+
})
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
/// Extract text content from a gzip-compressed file.
|
|
108
|
+
///
|
|
109
|
+
/// Decompresses and attempts to read the result as UTF-8 text.
|
|
110
|
+
pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
|
|
111
|
+
let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
|
|
112
|
+
|
|
113
|
+
let mut decoder = GzDecoder::new(bytes);
|
|
114
|
+
let mut _discard = [0u8; 1];
|
|
115
|
+
let _ = decoder.read(&mut _discard);
|
|
116
|
+
let filename = decoder
|
|
117
|
+
.header()
|
|
118
|
+
.and_then(|h| h.filename())
|
|
119
|
+
.and_then(|f| std::str::from_utf8(f).ok())
|
|
120
|
+
.unwrap_or("compressed_content")
|
|
121
|
+
.to_string();
|
|
122
|
+
|
|
123
|
+
let mut contents = HashMap::new();
|
|
124
|
+
if let Ok(text) = String::from_utf8(decompressed) {
|
|
125
|
+
contents.insert(filename, text);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
Ok(contents)
|
|
129
|
+
}
|