RubyGems - kreuzberg - Versions diffs - 4.2.11 → 4.2.13 - Mend

kreuzberg 4.2.11 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +2 -2
data/vendor/kreuzberg/Cargo.toml +24 -9
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
data/vendor/kreuzberg/src/core/mime.rs +47 -2
data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
data/vendor/kreuzberg/src/extraction/image.rs +405 -18
data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
data/vendor/kreuzberg/src/extractors/image.rs +25 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
data/vendor/kreuzberg/src/extractors/security.rs +2 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
data/vendor/kreuzberg/src/extractors/text.rs +33 -4
data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
metadata +7 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e8a57a9be868cea7df0628ac25c7f0aabea6ced5368b0ec69452abd8ace56cd9
-  data.tar.gz: dde627c6aff4ae060d53e5b44145bdaf6cfc8af870294f7521c200cf2b7e10ba
+  metadata.gz: 3c2053b10256948a215ff0d3552894991e801497ac4b2480eca3c98bb645cc27
+  data.tar.gz: 324b6147e172ecedb2338fab1b14ce2022a8b9c2d6be7fd86ac0f862d81ef7ce
 SHA512:
-  metadata.gz: 1c322dfecd4829e4e3aa13bbdd298f3f06f62877362867cc5795cc0690ef6b632ecbbac515f20c288ae3457ab5a022cb116b4922abc69a11e06151774f6f91f0
-  data.tar.gz: 50e8a2b5489f169afb9f6b60150954463ed31988b049607c86ac501051df8138e21eec3f04d1e7b8a68f00079078416f7b3a5c2e7d93bf6bfb2bb8313c8e8aa3
+  metadata.gz: 84a6636111d240c99eb17546f80c1df31117c700d78282c18a67a79aa613021d33988cbc1b00d5bc62bb2ffeef8c2a8f1759e137329de8f30af7f61b6db1a55b
+  data.tar.gz: 7628ecce3c6fb44c06a9546f2db696ae3486de35e0a05195cbea752bc6f78e573162e6305aec8c8ae0ca0fdbb6709a3e75752b822dbd8aed637eff9577c3e020

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.2.11)
+    kreuzberg (4.2.13)
       rb_sys (~> 0.9.119)
 GEM
@@ -123,7 +123,7 @@ GEM
       rubocop (~> 1.81)
     ruby-progressbar (1.13.0)
     securerandom (0.4.1)
-    sorbet-runtime (0.6.12915)
+    sorbet-runtime (0.6.12925)
     steep (1.10.0)
       activesupport (>= 5.1)
       concurrent-ruby (>= 1.1.10)
@@ -209,7 +209,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
-  kreuzberg (4.2.11)
+  kreuzberg (4.2.13)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -244,7 +244,7 @@ CHECKSUMS
   rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
   securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
-  sorbet-runtime (0.6.12915) sha256=21d2866b1edfe57c97d22f36db5bcf2db311f84290e56152e9faf4b4915aa315
+  sorbet-runtime (0.6.12925) sha256=ddd6fb1d8aaf6bc19119ffadbc4b96536f3d6766fa82059112dacb90977c6eca
   steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
   strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
   terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.11" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.13" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -37,7 +37,7 @@ collapsible_if = "allow"
 [package]
 name = "kreuzberg-rb"
-version = "4.2.11"
+version = "4.2.13"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.11'
+  VERSION = '4.2.13'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.11"
+version = "4.2.13"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -49,7 +49,7 @@ toml = "0.9.11"
 num_cpus = "1.17.0"
 once_cell = "1.21.3"
 html-to-markdown-rs = { version = "2.24.5", default-features = false }
-reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
+reqwest = { version = "0.13.2", default-features = false, features = ["json", "rustls"] }
 image = { version = "0.25.9", default-features = false }
 lzma-rust2 = { version = "0.15.7" }

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.11"
+version = "4.2.13"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -35,22 +35,21 @@ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
 office = [
     "dep:roxmltree",
     "dep:zip",
-    "dep:docx-lite",
     "dep:quick-xml",
     "dep:pulldown-cmark",
     "dep:biblatex",
+    "dep:biblib",
     "dep:org",
     "dep:rtf-parser",
     "dep:rst_parser",
     "dep:fb2",
     "dep:typst-syntax",
     "html",
-    "tokio-runtime",
 ]
 email = ["dep:mail-parser", "dep:msg_parser"]
 html = ["dep:html-to-markdown-rs"]
 xml = ["dep:quick-xml", "dep:roxmltree"]
-archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2"]
+archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
 ocr = [
     "dep:kreuzberg-tesseract",
@@ -59,6 +58,8 @@ ocr = [
     "dep:fast_image_resize",
     "dep:ndarray",
     "dep:kamadak-exif",
+    "dep:hayro-jpeg2000",
+    "dep:hayro-jbig2",
     "html",
 ]
 language-detection = ["dep:whatlang"]
@@ -77,7 +78,7 @@ mcp-http = ["mcp", "api"]
 otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
-wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality"]
+wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
 wasm-threads = ["dep:wasm-bindgen-rayon"]
 full = [
@@ -127,7 +128,7 @@ simdutf8 = { version = "0.1", optional = true }
 hex = { workspace = true }
 lazy_static = "1.5.0"
 libc = { workspace = true }
-memchr = "2.7.6"
+memchr = "2.8.0"
 num_cpus = { workspace = true }
 once_cell = { workspace = true }
 parking_lot = { workspace = true }
@@ -154,7 +155,9 @@ lopdf = { version = "0.39.0", optional = true }
 calamine = { version = "0.33.0", features = ["dates"], optional = true }
 polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
 roxmltree = { version = "0.21.1", optional = true }
-zip = { version = "7.3.0", optional = true }
+zip = { version = "7.4.0", optional = true, default-features = false, features = [
+    "deflate-flate2",
+] }
 mail-parser = { version = "0.11.1", optional = true }
 msg_parser = { version = "0.1.1", optional = true }
 html-to-markdown-rs = { workspace = true, features = [
@@ -165,10 +168,16 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
 tar = { version = "0.4.44", optional = true }
 sevenz-rust2 = { version = "0.20.1", optional = true }
 lzma-rust2 = { workspace = true, optional = true }
-docx-lite = { version = "0.2.0", optional = true }
+flate2 = { version = "1.0", optional = true }
 pulldown-cmark = { version = "0.13", optional = true }
 biblatex = { version = "0.11", optional = true }
+biblib = { version = "0.3", default-features = false, features = [
+    "ris",
+    "pubmed",
+    "xml",
+    "regex",
+], optional = true }
 org = { version = "0.3", optional = true }
 rtf-parser = { version = "0.4", optional = true }
 rst_parser = { version = "0.4", optional = true }
@@ -183,12 +192,18 @@ image = { workspace = true, default-features = false, features = [
     "bmp",
     "tiff",
     "gif",
+    "pnm",
     "rayon",
 ], optional = true }
 tiff = { version = "0.11", optional = true }
 fast_image_resize = { version = "6.0.0", optional = true }
 ndarray = { version = "0.17.2", optional = true }
 kamadak-exif = { version = "0.6.1", optional = true }
+hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
+    "std",
+    "simd",
+], optional = true }
+hayro-jbig2 = { version = "0.1", default-features = false, features = ["std"], optional = true }
 whatlang = { version = "0.18.0", optional = true }
 text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
 unicode-normalization = { version = "0.1.25", optional = true }
@@ -218,7 +233,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
 tempfile = { workspace = true }
 filetime = "0.2"
 tar = "0.4.44"
-zip = "7.3.0"
+zip = { version = "7.4.0", default-features = false, features = ["deflate-flate2"] }
 serial_test = "3.3.1"
 anyhow = { workspace = true }
 tokio-test = "0.4"

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.11 Release**
+> **🚀 Version 4.2.13 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/core/config/extraction/core.rs CHANGED Viewed

@@ -102,6 +102,15 @@ pub struct ExtractionConfig {
     #[serde(default)]
     pub result_format: crate::types::OutputFormat,
+    /// Security limits for archive extraction.
+    ///
+    /// Controls maximum archive size, compression ratio, file count, and other
+    /// security thresholds to prevent decompression bomb attacks.
+    /// When `None`, default limits are used (500MB archive, 100:1 ratio, 10K files).
+    #[cfg(feature = "archives")]
+    #[serde(default)]
+    pub security_limits: Option<crate::extractors::security::SecurityLimits>,
     /// Content text format (default: Plain).
     ///
     /// Controls the format of the extracted content:
@@ -137,6 +146,8 @@ impl Default for ExtractionConfig {
             #[cfg(feature = "html")]
             html_options: None,
             max_concurrent_extractions: None,
+            #[cfg(feature = "archives")]
+            security_limits: None,
             result_format: crate::types::OutputFormat::Unified,
             output_format: OutputFormat::Plain,
         }

data/vendor/kreuzberg/src/core/extractor/bytes.rs CHANGED Viewed

@@ -5,16 +5,16 @@
 //! - Legacy format conversion (DOC, PPT)
 //! - Extraction pipeline orchestration
-#[cfg(not(feature = "office"))]
+#[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
 use crate::KreuzbergError;
 use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
 use crate::types::ExtractionResult;
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use super::file::apply_libreoffice_metadata;
 use super::file::extract_bytes_with_extractor;
 #[cfg(feature = "otel")]
@@ -72,7 +72,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
         let validated_mime = mime::validate_mime_type(mime_type)?;
         match validated_mime.as_str() {
-            #[cfg(feature = "office")]
+            #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
             LEGACY_WORD_MIME_TYPE => {
                 let conversion = convert_doc_to_docx(content).await?;
                 let mut result =
@@ -80,13 +80,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
                 apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
                 return Ok(result);
             }
-            #[cfg(not(feature = "office"))]
+            #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
             LEGACY_WORD_MIME_TYPE => {
                 return Err(KreuzbergError::UnsupportedFormat(
                     "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
                 ));
             }
-            #[cfg(feature = "office")]
+            #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
             LEGACY_POWERPOINT_MIME_TYPE => {
                 let conversion = convert_ppt_to_pptx(content).await?;
                 let mut result =
@@ -94,7 +94,7 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
                 apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
                 return Ok(result);
             }
-            #[cfg(not(feature = "office"))]
+            #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
             LEGACY_POWERPOINT_MIME_TYPE => {
                 return Err(KreuzbergError::UnsupportedFormat(
                     "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),

data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED Viewed

@@ -6,23 +6,23 @@
 //! - File validation and reading
 //! - Extraction pipeline orchestration
-#[cfg(any(feature = "otel", not(feature = "office")))]
+#[cfg(any(feature = "otel", not(all(feature = "office", not(target_arch = "wasm32")))))]
 use crate::KreuzbergError;
 use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
 use crate::types::ExtractionResult;
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use crate::types::LibreOfficeConversionResult;
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use serde_json::json;
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use std::borrow::Cow;
 use std::path::Path;
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 use super::helpers::pool_mime_type;
 use super::helpers::get_extractor;
@@ -151,7 +151,7 @@ pub async fn extract_file(
         let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
         match detected_mime.as_str() {
-            #[cfg(feature = "office")]
+            #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
             LEGACY_WORD_MIME_TYPE => {
                 let original_bytes = tokio::fs::read(path).await?;
                 let conversion = convert_doc_to_docx(&original_bytes).await?;
@@ -160,13 +160,13 @@ pub async fn extract_file(
                 apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
                 return Ok(result);
             }
-            #[cfg(not(feature = "office"))]
+            #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
             LEGACY_WORD_MIME_TYPE => {
                 return Err(KreuzbergError::UnsupportedFormat(
                     "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
                 ));
             }
-            #[cfg(feature = "office")]
+            #[cfg(all(feature = "office", not(target_arch = "wasm32")))]
             LEGACY_POWERPOINT_MIME_TYPE => {
                 let original_bytes = tokio::fs::read(path).await?;
                 let conversion = convert_ppt_to_pptx(&original_bytes).await?;
@@ -175,7 +175,7 @@ pub async fn extract_file(
                 apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
                 return Ok(result);
             }
-            #[cfg(not(feature = "office"))]
+            #[cfg(not(all(feature = "office", not(target_arch = "wasm32"))))]
             LEGACY_POWERPOINT_MIME_TYPE => {
                 return Err(KreuzbergError::UnsupportedFormat(
                     "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
@@ -222,7 +222,7 @@ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
     Ok(result)
 }
-#[cfg(feature = "office")]
+#[cfg(all(feature = "office", not(target_arch = "wasm32")))]
 pub(in crate::core::extractor) fn apply_libreoffice_metadata(
     result: &mut ExtractionResult,
     legacy_mime: &str,

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -80,6 +80,10 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
     m.insert("jpx", "image/jpx");
     m.insert("jpm", "image/jpm");
     m.insert("mj2", "image/mj2");
+    m.insert("j2k", "image/jp2");
+    m.insert("j2c", "image/jp2");
+    m.insert("jbig2", "image/x-jbig2");
+    m.insert("jb2", "image/x-jbig2");
     m.insert("pnm", "image/x-portable-anymap");
     m.insert("pbm", "image/x-portable-bitmap");
     m.insert("pgm", "image/x-portable-graymap");
@@ -108,10 +112,18 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
     m.insert("epub", "application/epub+zip");
     m.insert("rtf", "application/rtf");
     m.insert("bib", "application/x-bibtex");
+    m.insert("ris", "application/x-research-info-systems");
+    m.insert("nbib", "application/x-pubmed");
+    m.insert("enw", "application/x-endnote+xml");
+    m.insert("fb2", "application/x-fictionbook+xml");
+    m.insert("opml", "application/xml+opml");
+    m.insert("dbk", "application/docbook+xml");
     m.insert("ipynb", "application/x-ipynb+json");
     m.insert("tex", "application/x-latex");
     m.insert("latex", "application/x-latex");
     m.insert("typst", "application/x-typst");
+    m.insert("typ", "application/x-typst");
+    m.insert("djot", "text/x-djot");
     m.insert("commonmark", "text/x-commonmark");
     m
@@ -137,6 +149,7 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert("image/tiff");
     set.insert("image/webp");
     set.insert("image/x-bmp");
+    set.insert("image/x-jbig2");
     set.insert("image/x-ms-bmp");
     set.insert("image/x-portable-anymap");
     set.insert("image/x-portable-bitmap");
@@ -146,20 +159,25 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert("application/csl+json");
     set.insert("application/docbook+xml");
+    set.insert("text/docbook");
     set.insert("application/epub+zip");
     set.insert("application/rtf");
     set.insert("application/vnd.oasis.opendocument.text");
     set.insert(DOCX_MIME_TYPE);
     set.insert("application/x-biblatex");
     set.insert("application/x-bibtex");
+    set.insert("text/x-bibtex");
     set.insert("application/x-endnote+xml");
     set.insert("application/x-fictionbook+xml");
+    set.insert("application/x-fictionbook");
+    set.insert("text/x-fictionbook");
     set.insert("application/x-ipynb+json");
     set.insert("application/x-jats+xml");
     set.insert("application/x-latex");
     set.insert("application/xml+opml");
     set.insert("application/x-opml+xml");
     set.insert("application/x-research-info-systems");
+    set.insert("application/x-pubmed");
     set.insert("application/x-typst");
     set.insert("text/csv");
     set.insert("text/tab-separated-values");
@@ -210,8 +228,26 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert("application/tar");
     set.insert("application/x-gtar");
     set.insert("application/x-ustar");
+    set.insert("application/gzip");
+    set.insert("application/x-gzip");
     set.insert("application/x-7z-compressed");
+    set.insert("text/djot");
+    set.insert("text/x-djot");
+    // Additional extractor-supported MIME types that must stay in sync
+    set.insert("text/jats");
+    set.insert("application/x-epub+zip");
+    set.insert("application/vnd.epub+zip");
+    set.insert("text/rtf");
+    set.insert("text/prs.fallenstein.rst");
+    set.insert("text/x-tex");
+    set.insert("text/org");
+    set.insert("application/x-org");
+    set.insert("application/xhtml+xml");
+    set.insert("text/x-typst");
+    set.insert("image/jpg");
     set
 });
@@ -291,6 +327,15 @@ pub fn validate_mime_type(mime_type: &str) -> Result<String> {
         return Ok(mime_type.to_string());
     }
+    // Case-insensitive fallback: MIME types are case-insensitive per RFC 2045.
+    // This handles common mismatches like "macroEnabled" vs "macroenabled".
+    let lower = mime_type.to_ascii_lowercase();
+    for supported in SUPPORTED_MIME_TYPES.iter() {
+        if supported.to_ascii_lowercase() == lower {
+            return Ok(supported.to_string());
+        }
+    }
     Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
 }
@@ -621,8 +666,8 @@ mod tests {
         let file_path = dir.path().join("testfile");
         File::create(&file_path).unwrap();
-        let result = detect_mime_type(&file_path, true);
-        assert!(result.is_err() || result.is_ok());
+        let _result = detect_mime_type(&file_path, true);
+        // Files without extensions may or may not be detected via mime_guess fallback
     }
     #[test]

data/vendor/kreuzberg/src/extraction/archive/gzip.rs ADDED Viewed

@@ -0,0 +1,129 @@
+//! Gzip decompression and extraction.
+//!
+//! Provides functions for decompressing gzip files and extracting
+//! metadata and text content from the compressed data.
+use super::{ArchiveEntry, ArchiveMetadata};
+use crate::error::{KreuzbergError, Result};
+use crate::extractors::security::SecurityLimits;
+use flate2::read::GzDecoder;
+use std::collections::HashMap;
+use std::io::Read;
+/// Decompress gzip bytes with a size limit to prevent decompression bombs.
+///
+/// At most `max_size + 1` bytes are pulled out of the decoder: producing that
+/// one extra byte is what proves the payload is larger than the limit, without
+/// ever buffering an unbounded amount of data.
+///
+/// # Errors
+///
+/// Returns a parsing error when the gzip stream cannot be decoded, and a
+/// validation error when the decompressed data is larger than `max_size` bytes.
+fn decompress_gzip_limited(bytes: &[u8], max_size: u64) -> Result<Vec<u8>> {
+    // `saturating_add` keeps a limit of `u64::MAX` from overflowing: a plain `+ 1`
+    // panics in debug builds and wraps to a zero-byte read cap in release builds,
+    // which would silently yield empty output instead of the decompressed data.
+    let read_cap = max_size.saturating_add(1);
+    let mut limited = GzDecoder::new(bytes).take(read_cap);
+    let mut decompressed = Vec::new();
+    limited
+        .read_to_end(&mut decompressed)
+        .map_err(|e| KreuzbergError::parsing(format!("Failed to decompress gzip: {}", e)))?;
+    // Anything beyond `max_size` means the extra probe byte was produced.
+    if decompressed.len() as u64 > max_size {
+        return Err(KreuzbergError::validation(format!(
+            "Gzip decompressed size exceeds {} byte limit",
+            max_size
+        )));
+    }
+    Ok(decompressed)
+}
+/// Decompress a gzip payload and hand back the raw decompressed bytes.
+///
+/// The output size is capped at `limits.max_archive_size` bytes; the cap itself
+/// is enforced by `decompress_gzip_limited`.
+pub fn decompress_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<Vec<u8>> {
+    let size_cap = limits.max_archive_size as u64;
+    decompress_gzip_limited(bytes, size_cap)
+}
+/// Decompress a gzip payload once and derive both its archive metadata and its text content.
+///
+/// Callers that need metadata and text together should use this instead of the
+/// two single-purpose functions, which each decompress the data on their own.
+///
+/// Returns the metadata (always a single entry) together with a map from the
+/// entry name to its text. The map stays empty when the decompressed bytes are
+/// not valid UTF-8.
+pub fn extract_gzip(bytes: &[u8], limits: &SecurityLimits) -> Result<(ArchiveMetadata, HashMap<String, String>)> {
+    let payload = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
+    let payload_len = payload.len() as u64;
+    // A second decoder is used only to get at the gzip header. One byte is pulled
+    // through it before the header is queried; the result of that read is
+    // discarded, and the placeholder name below covers a missing header/filename.
+    let mut header_reader = GzDecoder::new(bytes);
+    let mut scratch = [0u8; 1];
+    let _ = header_reader.read(&mut scratch);
+    let entry_name = match header_reader
+        .header()
+        .and_then(|h| h.filename())
+        .and_then(|raw| std::str::from_utf8(raw).ok())
+    {
+        Some(name) => name.to_string(),
+        None => "compressed_content".to_string(),
+    };
+    let entry = ArchiveEntry {
+        path: entry_name.clone(),
+        size: payload_len,
+        is_dir: false,
+    };
+    let metadata = ArchiveMetadata {
+        format: "GZIP".to_string(),
+        file_list: vec![entry],
+        file_count: 1,
+        total_size: payload_len,
+    };
+    // Zero or one map entries, depending on whether the payload is UTF-8 text.
+    let contents: HashMap<String, String> = String::from_utf8(payload)
+        .ok()
+        .map(|text| (entry_name, text))
+        .into_iter()
+        .collect();
+    Ok((metadata, contents))
+}
+/// Build archive metadata for a gzip-compressed payload.
+///
+/// A gzip file wraps exactly one stream, so the result lists a single entry:
+/// its name comes from the gzip header (falling back to `compressed_content`)
+/// and its size is the length of the decompressed data.
+pub fn extract_gzip_metadata(bytes: &[u8], limits: &SecurityLimits) -> Result<ArchiveMetadata> {
+    // The payload is decompressed in full because the reported size is the
+    // decompressed length; only that length is kept.
+    let total_size = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?.len() as u64;
+    // Separate decoder used purely for header access; the one-byte read result is discarded.
+    let mut header_reader = GzDecoder::new(bytes);
+    let mut scratch = [0u8; 1];
+    let _ = header_reader.read(&mut scratch);
+    let path = header_reader
+        .header()
+        .and_then(|h| h.filename())
+        .and_then(|raw| std::str::from_utf8(raw).ok())
+        .map_or_else(|| "compressed_content".to_string(), str::to_string);
+    let entry = ArchiveEntry {
+        path,
+        size: total_size,
+        is_dir: false,
+    };
+    Ok(ArchiveMetadata {
+        format: "GZIP".to_string(),
+        file_list: vec![entry],
+        file_count: 1,
+        total_size,
+    })
+}
+/// Decompress a gzip payload and return its content as text.
+///
+/// The returned map has one entry, keyed by the filename stored in the gzip
+/// header (or `compressed_content` when none is available). When the
+/// decompressed bytes are not valid UTF-8 the map is empty.
+pub fn extract_gzip_text_content(bytes: &[u8], limits: &SecurityLimits) -> Result<HashMap<String, String>> {
+    let payload = decompress_gzip_limited(bytes, limits.max_archive_size as u64)?;
+    // Separate decoder used purely for header access; the one-byte read result is discarded.
+    let mut header_reader = GzDecoder::new(bytes);
+    let mut scratch = [0u8; 1];
+    let _ = header_reader.read(&mut scratch);
+    let entry_name = header_reader
+        .header()
+        .and_then(|h| h.filename())
+        .and_then(|raw| std::str::from_utf8(raw).ok())
+        .unwrap_or("compressed_content");
+    // Non-UTF-8 payloads are not an error here: they simply produce no entries.
+    let Ok(text) = String::from_utf8(payload) else {
+        return Ok(HashMap::new());
+    };
+    Ok(HashMap::from([(entry_name.to_string(), text)]))
+}