kreuzberg 4.2.12 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -7
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  19. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  20. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  21. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  22. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  23. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  24. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  25. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  26. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  27. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  28. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  29. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  30. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  31. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 869101ec7a3d0814c2baed8606879024e94880ead0003d441a16199d25fd3a16
4
- data.tar.gz: 9852c4c51345f362095306f40d910c2e1d4ae19f385754d4c8d3a960123f96ee
3
+ metadata.gz: 3c2053b10256948a215ff0d3552894991e801497ac4b2480eca3c98bb645cc27
4
+ data.tar.gz: 324b6147e172ecedb2338fab1b14ce2022a8b9c2d6be7fd86ac0f862d81ef7ce
5
5
  SHA512:
6
- metadata.gz: 579be7645c2f406ce8e7c4cc85ed511edb7c5879bc8674fe9ec4eb4375cf240113968f0ce35747bd10fae52ff5849df4616d8c6db299db609696da24b2700fff
7
- data.tar.gz: adc5969c0480739fd57a6bdb832db435d64e16ee0b2827197f05fcf4f52a7e7c03affd704b4c1064df49391bb105c4166531b8c4ba96538906b38f863744c843
6
+ metadata.gz: 84a6636111d240c99eb17546f80c1df31117c700d78282c18a67a79aa613021d33988cbc1b00d5bc62bb2ffeef8c2a8f1759e137329de8f30af7f61b6db1a55b
7
+ data.tar.gz: 7628ecce3c6fb44c06a9546f2db696ae3486de35e0a05195cbea752bc6f78e573162e6305aec8c8ae0ca0fdbb6709a3e75752b822dbd8aed637eff9577c3e020
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.12)
4
+ kreuzberg (4.2.13)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.2.12)
212
+ kreuzberg (4.2.13)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.12" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.2.12"
40
+ version = "4.2.13"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.12'
4
+ VERSION = '4.2.13'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.12"
6
+ version = "4.2.13"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -49,7 +49,7 @@ toml = "0.9.11"
49
49
  num_cpus = "1.17.0"
50
50
  once_cell = "1.21.3"
51
51
  html-to-markdown-rs = { version = "2.24.5", default-features = false }
52
- reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
+ reqwest = { version = "0.13.2", default-features = false, features = ["json", "rustls"] }
53
53
  image = { version = "0.25.9", default-features = false }
54
54
  lzma-rust2 = { version = "0.15.7" }
55
55
 
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.12"
3
+ version = "4.2.13"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -38,18 +38,18 @@ office = [
38
38
  "dep:quick-xml",
39
39
  "dep:pulldown-cmark",
40
40
  "dep:biblatex",
41
+ "dep:biblib",
41
42
  "dep:org",
42
43
  "dep:rtf-parser",
43
44
  "dep:rst_parser",
44
45
  "dep:fb2",
45
46
  "dep:typst-syntax",
46
47
  "html",
47
- "tokio-runtime",
48
48
  ]
49
49
  email = ["dep:mail-parser", "dep:msg_parser"]
50
50
  html = ["dep:html-to-markdown-rs"]
51
51
  xml = ["dep:quick-xml", "dep:roxmltree"]
52
- archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2"]
52
+ archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
53
53
 
54
54
  ocr = [
55
55
  "dep:kreuzberg-tesseract",
@@ -58,6 +58,8 @@ ocr = [
58
58
  "dep:fast_image_resize",
59
59
  "dep:ndarray",
60
60
  "dep:kamadak-exif",
61
+ "dep:hayro-jpeg2000",
62
+ "dep:hayro-jbig2",
61
63
  "html",
62
64
  ]
63
65
  language-detection = ["dep:whatlang"]
@@ -76,7 +78,7 @@ mcp-http = ["mcp", "api"]
76
78
 
77
79
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
78
80
 
79
- wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
81
+ wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
80
82
  wasm-threads = ["dep:wasm-bindgen-rayon"]
81
83
 
82
84
  full = [
@@ -126,7 +128,7 @@ simdutf8 = { version = "0.1", optional = true }
126
128
  hex = { workspace = true }
127
129
  lazy_static = "1.5.0"
128
130
  libc = { workspace = true }
129
- memchr = "2.7.6"
131
+ memchr = "2.8.0"
130
132
  num_cpus = { workspace = true }
131
133
  once_cell = { workspace = true }
132
134
  parking_lot = { workspace = true }
@@ -153,7 +155,9 @@ lopdf = { version = "0.39.0", optional = true }
153
155
  calamine = { version = "0.33.0", features = ["dates"], optional = true }
154
156
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
155
157
  roxmltree = { version = "0.21.1", optional = true }
156
- zip = { version = "7.4.0", optional = true }
158
+ zip = { version = "7.4.0", optional = true, default-features = false, features = [
159
+ "deflate-flate2",
160
+ ] }
157
161
  mail-parser = { version = "0.11.1", optional = true }
158
162
  msg_parser = { version = "0.1.1", optional = true }
159
163
  html-to-markdown-rs = { workspace = true, features = [
@@ -164,9 +168,16 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
164
168
  tar = { version = "0.4.44", optional = true }
165
169
  sevenz-rust2 = { version = "0.20.1", optional = true }
166
170
  lzma-rust2 = { workspace = true, optional = true }
171
+ flate2 = { version = "1.0", optional = true }
167
172
 
168
173
  pulldown-cmark = { version = "0.13", optional = true }
169
174
  biblatex = { version = "0.11", optional = true }
175
+ biblib = { version = "0.3", default-features = false, features = [
176
+ "ris",
177
+ "pubmed",
178
+ "xml",
179
+ "regex",
180
+ ], optional = true }
170
181
  org = { version = "0.3", optional = true }
171
182
  rtf-parser = { version = "0.4", optional = true }
172
183
  rst_parser = { version = "0.4", optional = true }
@@ -181,12 +192,18 @@ image = { workspace = true, default-features = false, features = [
181
192
  "bmp",
182
193
  "tiff",
183
194
  "gif",
195
+ "pnm",
184
196
  "rayon",
185
197
  ], optional = true }
186
198
  tiff = { version = "0.11", optional = true }
187
199
  fast_image_resize = { version = "6.0.0", optional = true }
188
200
  ndarray = { version = "0.17.2", optional = true }
189
201
  kamadak-exif = { version = "0.6.1", optional = true }
202
+ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
203
+ "std",
204
+ "simd",
205
+ ], optional = true }
206
+ hayro-jbig2 = { version = "0.1", default-features = false, features = ["std"], optional = true }
190
207
  whatlang = { version = "0.18.0", optional = true }
191
208
  text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
192
209
  unicode-normalization = { version = "0.1.25", optional = true }
@@ -216,7 +233,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
216
233
  tempfile = { workspace = true }
217
234
  filetime = "0.2"
218
235
  tar = "0.4.44"
219
- zip = "7.4.0"
236
+ zip = { version = "7.4.0", default-features = false, features = ["deflate-flate2"] }
220
237
  serial_test = "3.3.1"
221
238
  anyhow = { workspace = true }
222
239
  tokio-test = "0.4"
data/vendor/kreuzberg/README.md CHANGED
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.12 Release**
20
+ > **🚀 Version 4.2.13 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/core/config/extraction/core.rs CHANGED
@@ -102,6 +102,15 @@ pub struct ExtractionConfig {
102
102
  #[serde(default)]
103
103
  pub result_format: crate::types::OutputFormat,
104
104
 
105
+ /// Security limits for archive extraction.
106
+ ///
107
+ /// Controls maximum archive size, compression ratio, file count, and other
108
+ /// security thresholds to prevent decompression bomb attacks.
109
+ /// When `None`, default limits are used (500MB archive, 100:1 ratio, 10K files).
110
+ #[cfg(feature = "archives")]
111
+ #[serde(default)]
112
+ pub security_limits: Option<crate::extractors::security::SecurityLimits>,
113
+
105
114
  /// Content text format (default: Plain).
106
115
  ///
107
116
  /// Controls the format of the extracted content:
@@ -137,6 +146,8 @@ impl Default for ExtractionConfig {
137
146
  #[cfg(feature = "html")]
138
147
  html_options: None,
139
148
  max_concurrent_extractions: None,
149
+ #[cfg(feature = "archives")]
150
+ security_limits: None,
140
151
  result_format: crate::types::OutputFormat::Unified,
141
152
  output_format: OutputFormat::Plain,
142
153
  }
data/vendor/kreuzberg/src/core/extractor/bytes.rs CHANGED
@@ -5,16 +5,16 @@
5
5
  //! - Legacy format conversion (DOC, PPT)
6
6
  //! - Extraction pipeline orchestration
7
7
 
8
- #[cfg(not(feature = "office"))]
8
+ #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
9
9
  use crate::KreuzbergError;
10
10
  use crate::Result;
11
11
  use crate::core::config::ExtractionConfig;
12
12
  use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
13
- #[cfg(feature = "office")]
13
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
14
14
  use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
15
15
  use crate::types::ExtractionResult;
16
16
 
17
- #[cfg(feature = "office")]
17
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
18
18
  use super::file::apply_libreoffice_metadata;
19
19
  use super::file::extract_bytes_with_extractor;
20
20
  #[cfg(feature = "otel")]
@@ -72,7 +72,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
72
72
  let validated_mime = mime::validate_mime_type(mime_type)?;
73
73
 
74
74
  match validated_mime.as_str() {
75
- #[cfg(feature = "office")]
75
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
76
76
  LEGACY_WORD_MIME_TYPE => {
77
77
  let conversion = convert_doc_to_docx(content).await?;
78
78
  let mut result =
@@ -80,13 +80,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
80
80
  apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
81
81
  return Ok(result);
82
82
  }
83
- #[cfg(not(feature = "office"))]
83
+ #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
84
84
  LEGACY_WORD_MIME_TYPE => {
85
85
  return Err(KreuzbergError::UnsupportedFormat(
86
86
  "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
87
87
  ));
88
88
  }
89
- #[cfg(feature = "office")]
89
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
90
90
  LEGACY_POWERPOINT_MIME_TYPE => {
91
91
  let conversion = convert_ppt_to_pptx(content).await?;
92
92
  let mut result =
@@ -94,7 +94,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
94
94
  apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
95
95
  return Ok(result);
96
96
  }
97
- #[cfg(not(feature = "office"))]
97
+ #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
98
98
  LEGACY_POWERPOINT_MIME_TYPE => {
99
99
  return Err(KreuzbergError::UnsupportedFormat(
100
100
  "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED
@@ -6,23 +6,23 @@
6
6
  //! - File validation and reading
7
7
  //! - Extraction pipeline orchestration
8
8
 
9
- #[cfg(any(feature = "otel", not(feature = "office")))]
9
+ #[cfg(any(feature = "otel", not(all(feature = "office", not(target_arch = "wasm32")))))]
10
10
  use crate::KreuzbergError;
11
11
  use crate::Result;
12
12
  use crate::core::config::ExtractionConfig;
13
13
  use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
14
- #[cfg(feature = "office")]
14
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
15
15
  use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
16
16
  use crate::types::ExtractionResult;
17
- #[cfg(feature = "office")]
17
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
18
18
  use crate::types::LibreOfficeConversionResult;
19
- #[cfg(feature = "office")]
19
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
20
20
  use serde_json::json;
21
- #[cfg(feature = "office")]
21
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
22
22
  use std::borrow::Cow;
23
23
  use std::path::Path;
24
24
 
25
- #[cfg(feature = "office")]
25
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
26
26
  use super::helpers::pool_mime_type;
27
27
 
28
28
  use super::helpers::get_extractor;
@@ -151,7 +151,7 @@ pub async fn extract_file(
151
151
  let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
152
152
 
153
153
  match detected_mime.as_str() {
154
- #[cfg(feature = "office")]
154
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
155
155
  LEGACY_WORD_MIME_TYPE => {
156
156
  let original_bytes = tokio::fs::read(path).await?;
157
157
  let conversion = convert_doc_to_docx(&original_bytes).await?;
@@ -160,13 +160,13 @@ pub async fn extract_file(
160
160
  apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
161
161
  return Ok(result);
162
162
  }
163
- #[cfg(not(feature = "office"))]
163
+ #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
164
164
  LEGACY_WORD_MIME_TYPE => {
165
165
  return Err(KreuzbergError::UnsupportedFormat(
166
166
  "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
167
167
  ));
168
168
  }
169
- #[cfg(feature = "office")]
169
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
170
170
  LEGACY_POWERPOINT_MIME_TYPE => {
171
171
  let original_bytes = tokio::fs::read(path).await?;
172
172
  let conversion = convert_ppt_to_pptx(&original_bytes).await?;
@@ -175,7 +175,7 @@ pub async fn extract_file(
175
175
  apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
176
176
  return Ok(result);
177
177
  }
178
- #[cfg(not(feature = "office"))]
178
+ #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
179
179
  LEGACY_POWERPOINT_MIME_TYPE => {
180
180
  return Err(KreuzbergError::UnsupportedFormat(
181
181
  "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
@@ -222,7 +222,7 @@ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
222
222
  Ok(result)
223
223
  }
224
224
 
225
- #[cfg(feature = "office")]
225
+ #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
226
226
  pub(in crate::core::extractor) fn apply_libreoffice_metadata(
227
227
  result: &mut ExtractionResult,
228
228
  legacy_mime: &str,
data/vendor/kreuzberg/src/core/mime.rs CHANGED
@@ -80,6 +80,10 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
80
80
  m.insert("jpx", "image/jpx");
81
81
  m.insert("jpm", "image/jpm");
82
82
  m.insert("mj2", "image/mj2");
83
+ m.insert("j2k", "image/jp2");
84
+ m.insert("j2c", "image/jp2");
85
+ m.insert("jbig2", "image/x-jbig2");
86
+ m.insert("jb2", "image/x-jbig2");
83
87
  m.insert("pnm", "image/x-portable-anymap");
84
88
  m.insert("pbm", "image/x-portable-bitmap");
85
89
  m.insert("pgm", "image/x-portable-graymap");
@@ -108,10 +112,18 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
108
112
  m.insert("epub", "application/epub+zip");
109
113
  m.insert("rtf", "application/rtf");
110
114
  m.insert("bib", "application/x-bibtex");
115
+ m.insert("ris", "application/x-research-info-systems");
116
+ m.insert("nbib", "application/x-pubmed");
117
+ m.insert("enw", "application/x-endnote+xml");
118
+ m.insert("fb2", "application/x-fictionbook+xml");
119
+ m.insert("opml", "application/xml+opml");
120
+ m.insert("dbk", "application/docbook+xml");
111
121
  m.insert("ipynb", "application/x-ipynb+json");
112
122
  m.insert("tex", "application/x-latex");
113
123
  m.insert("latex", "application/x-latex");
114
124
  m.insert("typst", "application/x-typst");
125
+ m.insert("typ", "application/x-typst");
126
+ m.insert("djot", "text/x-djot");
115
127
  m.insert("commonmark", "text/x-commonmark");
116
128
 
117
129
  m
@@ -137,6 +149,7 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
137
149
  set.insert("image/tiff");
138
150
  set.insert("image/webp");
139
151
  set.insert("image/x-bmp");
152
+ set.insert("image/x-jbig2");
140
153
  set.insert("image/x-ms-bmp");
141
154
  set.insert("image/x-portable-anymap");
142
155
  set.insert("image/x-portable-bitmap");
@@ -146,20 +159,25 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
146
159
 
147
160
  set.insert("application/csl+json");
148
161
  set.insert("application/docbook+xml");
162
+ set.insert("text/docbook");
149
163
  set.insert("application/epub+zip");
150
164
  set.insert("application/rtf");
151
165
  set.insert("application/vnd.oasis.opendocument.text");
152
166
  set.insert(DOCX_MIME_TYPE);
153
167
  set.insert("application/x-biblatex");
154
168
  set.insert("application/x-bibtex");
169
+ set.insert("text/x-bibtex");
155
170
  set.insert("application/x-endnote+xml");
156
171
  set.insert("application/x-fictionbook+xml");
172
+ set.insert("application/x-fictionbook");
173
+ set.insert("text/x-fictionbook");
157
174
  set.insert("application/x-ipynb+json");
158
175
  set.insert("application/x-jats+xml");
159
176
  set.insert("application/x-latex");
160
177
  set.insert("application/xml+opml");
161
178
  set.insert("application/x-opml+xml");
162
179
  set.insert("application/x-research-info-systems");
180
+ set.insert("application/x-pubmed");
163
181
  set.insert("application/x-typst");
164
182
  set.insert("text/csv");
165
183
  set.insert("text/tab-separated-values");
@@ -210,8 +228,26 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
210
228
  set.insert("application/tar");
211
229
  set.insert("application/x-gtar");
212
230
  set.insert("application/x-ustar");
231
+ set.insert("application/gzip");
232
+ set.insert("application/x-gzip");
213
233
  set.insert("application/x-7z-compressed");
214
234
 
235
+ set.insert("text/djot");
236
+ set.insert("text/x-djot");
237
+
238
+ // Additional extractor-supported MIME types that must stay in sync
239
+ set.insert("text/jats");
240
+ set.insert("application/x-epub+zip");
241
+ set.insert("application/vnd.epub+zip");
242
+ set.insert("text/rtf");
243
+ set.insert("text/prs.fallenstein.rst");
244
+ set.insert("text/x-tex");
245
+ set.insert("text/org");
246
+ set.insert("application/x-org");
247
+ set.insert("application/xhtml+xml");
248
+ set.insert("text/x-typst");
249
+ set.insert("image/jpg");
250
+
215
251
  set
216
252
  });
217
253
 
@@ -291,6 +327,15 @@ pub fn validate_mime_type(mime_type: &str) -> Result<String> {
291
327
  return Ok(mime_type.to_string());
292
328
  }
293
329
 
330
+ // Case-insensitive fallback: MIME types are case-insensitive per RFC 2045.
331
+ // This handles common mismatches like "macroEnabled" vs "macroenabled".
332
+ let lower = mime_type.to_ascii_lowercase();
333
+ for supported in SUPPORTED_MIME_TYPES.iter() {
334
+ if supported.to_ascii_lowercase() == lower {
335
+ return Ok(supported.to_string());
336
+ }
337
+ }
338
+
294
339
  Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
295
340
  }
296
341
 
@@ -621,8 +666,8 @@ mod tests {
621
666
  let file_path = dir.path().join("testfile");
622
667
  File::create(&file_path).unwrap();
623
668
 
624
- let result = detect_mime_type(&file_path, true);
625
- assert!(result.is_err() || result.is_ok());
669
+ let _result = detect_mime_type(&file_path, true);
670
+ // Files without extensions may or may not be detected via mime_guess fallback
626
671
  }
627
672
 
628
673
  #[test]
data/vendor/kreuzberg/src/extraction/archive/gzip.rs ADDED
@@ -0,0 +1,129 @@
1
+ //! Gzip decompression and extraction.
2
+ //!
3
+ //! Provides functions for decompressing gzip files and extracting
4
+ //! metadata and text content from the compressed data.
5
+
6
+ use super::{ArchiveEntry, ArchiveMetadata};
7
+ use crate::error::{KreuzbergError, Result};
8
+ use crate::extractors::security::SecurityLimits;
9
+ use flate2::read::GzDecoder;
10
+ use std::collections::HashMap;
11
+ use std::io::Read;
12
+
13
+ /// Decompress gzip bytes with a size limit to prevent decompression bombs.
14
+ fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
15
+ let decoder = GzDecoder::new(bytes);
16
+ let mut limited = decoder.take(max_size + 1);
17
+ let mut decompressed = Vec::new();
18
+ limited
19
+ .read_to_end(&mut decompressed)
20
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to decompress gzip: {}", e)))?;
21
+
22
+ if decompressed.len() as u64 > max_size {
23
+ return Err(KreuzbergError::validation(format!(
24
+ "Gzip decompressed size exceeds {} byte limit",
25
+ max_size
26
+ )));
27
+ }
28
+
29
+ Ok(decompressed)
30
+ }
31
+
32
+ /// Decompress gzip bytes, returning the raw decompressed data.
33
+ pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>> {
34
+ decompress_gzip_limited(bytes, limits.max_archive_size as u64)
35
+ }
36
+
37
+ /// Extract both metadata and text content from gzip in a single decompression pass.
38
+ ///
39
+ /// This avoids the overhead of decompressing the data multiple times when both
40
+ /// metadata and text content are needed.
41
+ pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
42
+ let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
43
+
44
+ // Re-read header for filename (lightweight - no decompression)
45
+ let mut decoder = GzDecoder::new(bytes);
46
+ let mut _discard = [0u8; 1];
47
+ let _ = decoder.read(&mut _discard); // trigger header read
48
+ let filename = decoder
49
+ .header()
50
+ .and_then(|h| h.filename())
51
+ .and_then(|f| std::str::from_utf8(f).ok())
52
+ .unwrap_or("compressed_content")
53
+ .to_string();
54
+
55
+ let size = decompressed.len() as u64;
56
+
57
+ let metadata = ArchiveMetadata {
58
+ format: "GZIP".to_string(),
59
+ file_list: vec![ArchiveEntry {
60
+ path: filename.clone(),
61
+ size,
62
+ is_dir: false,
63
+ }],
64
+ file_count: 1,
65
+ total_size: size,
66
+ };
67
+
68
+ let mut contents = HashMap::new();
69
+ if let Ok(text) = String::from_utf8(decompressed) {
70
+ contents.insert(filename, text);
71
+ }
72
+
73
+ Ok((metadata, contents))
74
+ }
75
+
76
+ /// Extract metadata from a gzip-compressed file.
77
+ ///
78
+ /// Gzip wraps a single stream, so the metadata contains one entry
79
+ /// with the original filename (from gzip header) and decompressed size.
80
+ pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
81
+ let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
82
+
83
+ let mut decoder = GzDecoder::new(bytes);
84
+ let mut _discard = [0u8; 1];
85
+ let _ = decoder.read(&mut _discard);
86
+ let filename = decoder
87
+ .header()
88
+ .and_then(|h| h.filename())
89
+ .and_then(|f| std::str::from_utf8(f).ok())
90
+ .unwrap_or("compressed_content")
91
+ .to_string();
92
+
93
+ let size = decompressed.len() as u64;
94
+
95
+ Ok(ArchiveMetadata {
96
+ format: "GZIP".to_string(),
97
+ file_list: vec![ArchiveEntry {
98
+ path: filename,
99
+ size,
100
+ is_dir: false,
101
+ }],
102
+ file_count: 1,
103
+ total_size: size,
104
+ })
105
+ }
106
+
107
+ /// Extract text content from a gzip-compressed file.
108
+ ///
109
+ /// Decompresses and attempts to read the result as UTF-8 text.
110
+ pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
111
+ let decompressed = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
112
+
113
+ let mut decoder = GzDecoder::new(bytes);
114
+ let mut _discard = [0u8; 1];
115
+ let _ = decoder.read(&mut _discard);
116
+ let filename = decoder
117
+ .header()
118
+ .and_then(|h| h.filename())
119
+ .and_then(|f| std::str::from_utf8(f).ok())
120
+ .unwrap_or("compressed_content")
121
+ .to_string();
122
+
123
+ let mut contents = HashMap::new();
124
+ if let Ok(text) = String::from_utf8(decompressed) {
125
+ contents.insert(filename, text);
126
+ }
127
+
128
+ Ok(contents)
129
+ }