RubyGems - kreuzberg - Versions diffs - 4.2.14 → 4.2.15 - Mend

kreuzberg 4.2.14 → 4.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/kreuzberg.gemspec +6 -5
data/lib/kreuzberg/config.rb +8 -3
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +1 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/mime.rs +2 -0
data/vendor/kreuzberg/src/extraction/email.rs +57 -9
data/vendor/kreuzberg/src/extractors/docx.rs +85 -50
data/vendor/kreuzberg/src/extractors/mod.rs +6 -10
data/vendor/kreuzberg/src/extractors/odt.rs +95 -30
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +22 -147
data/vendor/kreuzberg/src/extractors/pptx.rs +34 -18
data/vendor/kreuzberg/tests/content_parity_debug.rs +280 -0
data/vendor/kreuzberg-ffi/src/string_intern.rs +9 -7
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +10 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: da61e06dfa4643e485c13636998888f03699816b7462087c9df6c9639d53fc45
-  data.tar.gz: 20a9c88f3eac809d2d158e15ea3747c425d47b3af0e2bf93825c831c9aa11aa9
+  metadata.gz: '09cd6cd5af8800892e58b09ade91500bac99dd1149569bde1721195cc52e94a2'
+  data.tar.gz: 7e4c00ce10c8ee8b576f9b6a1634112b9d3b21daee1bbb69f73756027bcb8876
 SHA512:
-  metadata.gz: 7be55db6494d45de03b3fee1271e1bc151193709098bdfe94fb7a5fb33159dd9a0b8b08fffd5ed2d3b24f3f3766c0bb1e81b42319d25b529088afe7e6a4c52d6
-  data.tar.gz: bd94796f90094ca64775c0ded247bc216fdfe6ee50d4c6258685ccc5b5b33e1a69546cd892ac7dce77eee72123cbaf6ce239ee932a117c8ad5dfe801a9a548bf
+  metadata.gz: 4eaa814c5d1d2ab357df797f39614c8fcc0013fac82863ac5bd306c61e05a6eb8a026e2b463876540e9c4c74422f6bcd058e8d279fa1ccbfcfa3fc94a1e9b815
+  data.tar.gz: b3068c76b4f6410640b5f9110a8e8cbf52b9ee8395fa11f0a2fe078ae1ecf8ce77070edd8786a063f835320ec2d1e80f6cb7f7439eacb794377d6073c8956af5

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.2.14)
+    kreuzberg (4.2.15)
       rb_sys (~> 0.9.119)
 GEM
@@ -209,7 +209,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
-  kreuzberg (4.2.14)
+  kreuzberg (4.2.15)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.14" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.15" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -37,7 +37,7 @@ collapsible_if = "allow"
 [package]
 name = "kreuzberg-rb"
-version = "4.2.14"
+version = "4.2.15"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/kreuzberg.gemspec CHANGED Viewed

@@ -169,11 +169,12 @@ Gem::Specification.new do |spec|
   spec.authors = ['Na\'aman Hirschfeld']
   spec.email = ['nhirschfeld@gmail.com']
-  spec.summary = 'High-performance document intelligence framework'
+  spec.summary = 'Document intelligence library — extract text from PDFs, Office docs, images, and 62+ formats'
   spec.description = <<~DESC
-    Kreuzberg is a multi-language document intelligence framework with a high-performance
-    Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
-    including PDF, DOCX, PPTX, XLSX, images, and more.
+    Kreuzberg is a high-performance document intelligence library with a Rust core and native
+    Ruby bindings via Magnus. Extract text, metadata, and structured data from 62+ file formats
+    including PDF, DOCX, PPTX, XLSX, HTML, RTF, images (with OCR), email, archives, and more.
+    Features async/sync APIs, text chunking, language detection, and keyword extraction.
   DESC
   spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
   spec.license = 'MIT'
@@ -186,7 +187,7 @@ Gem::Specification.new do |spec|
     'documentation_uri' => 'https://docs.kreuzberg.dev',
     'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
     'rubygems_mfa_required' => 'true',
-    'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
+    'keywords' => 'document-intelligence,document-extraction,text-extraction,ocr,pdf,rust,native-extension,nlp,rag'
   }
   spec.files = files

data/lib/kreuzberg/config.rb CHANGED Viewed

@@ -707,7 +707,8 @@ module Kreuzberg
                   :ocr, :chunking, :language_detection, :pdf_options,
                   :images, :postprocessor,
                   :token_reduction, :keywords, :html_options, :pages,
-                  :max_concurrent_extractions, :output_format, :result_format
+                  :max_concurrent_extractions, :output_format, :result_format,
+                  :security_limits
       # Alias for backward compatibility - image_extraction is the canonical name
       alias image_extraction images
@@ -732,6 +733,7 @@ module Kreuzberg
         language_detection pdf_options image_extraction
         postprocessor token_reduction keywords html_options pages
         max_concurrent_extractions output_format result_format
+        security_limits
       ].freeze
       # Aliases for backward compatibility
@@ -804,7 +806,8 @@ module Kreuzberg
                      pages: nil,
                      max_concurrent_extractions: nil,
                      output_format: nil,
-                     result_format: nil)
+                     result_format: nil,
+                     security_limits: nil)
         kwargs = {
           use_cache: use_cache, enable_quality_processing: enable_quality_processing,
           force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
@@ -812,7 +815,8 @@ module Kreuzberg
           postprocessor: postprocessor,
           token_reduction: token_reduction, keywords: keywords, html_options: html_options,
           pages: pages, max_concurrent_extractions: max_concurrent_extractions,
-          output_format: output_format, result_format: result_format
+          output_format: output_format, result_format: result_format,
+          security_limits: security_limits
         }
         extracted = extract_from_hash(hash, kwargs)
@@ -843,6 +847,7 @@ module Kreuzberg
         @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
         @output_format = validate_output_format(params[:output_format])
         @result_format = validate_result_format(params[:result_format])
+        @security_limits = params[:security_limits]
       end
       def validate_output_format(value)

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.14'
+  VERSION = '4.2.15'
 end

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -242,6 +242,7 @@ module Kreuzberg
       attr_reader max_concurrent_extractions: Integer?
       attr_reader output_format: String?
       attr_reader result_format: String?
+      attr_reader security_limits: Hash[String, Integer]?
       alias image_extraction images

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.14"
+version = "4.2.15"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,10 +1,10 @@
 [package]
 name = "kreuzberg"
-version = "4.2.14"
+version = "4.2.15"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
-description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
+description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 62+ formats with async/sync APIs."
 license = "MIT"
 repository = "https://github.com/kreuzberg-dev/kreuzberg"
 homepage = "https://kreuzberg.dev"

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.14 Release**
+> **🚀 Version 4.2.15 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -118,6 +118,8 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
     m.insert("fb2", "application/x-fictionbook+xml");
     m.insert("opml", "application/xml+opml");
     m.insert("dbk", "application/docbook+xml");
+    m.insert("docbook", "application/docbook+xml");
+    m.insert("jats", "application/x-jats+xml");
     m.insert("ipynb", "application/x-ipynb+json");
     m.insert("tex", "application/x-latex");
     m.insert("latex", "application/x-latex");

data/vendor/kreuzberg/src/extraction/email.rs CHANGED Viewed

@@ -54,10 +54,61 @@ fn whitespace_regex() -> &'static Regex {
     WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").unwrap())
 }
+/// Detect UTF-16 encoding (with or without BOM) and transcode to UTF-8 if needed.
+///
+/// `mail_parser` expects ASCII/UTF-8 input. If the EML file is encoded as
+/// UTF-16, we transcode it to UTF-8 first.
+///
+/// Detection strategy:
+/// 1. Check for BOM (`FF FE` = LE, `FE FF` = BE)
+/// 2. If no BOM, use heuristic: EML files start with ASCII headers, so
+///    alternating zero bytes indicate UTF-16 encoding.
+fn maybe_transcode_utf16(data: &[u8]) -> Option<Vec<u8>> {
+    if data.len() < 4 {
+        return None;
+    }
+    let (is_le, skip) = if data[0] == 0xFF && data[1] == 0xFE {
+        (true, 2)
+    } else if data[0] == 0xFE && data[1] == 0xFF {
+        (false, 2)
+    } else if data[1] == 0x00 && data[3] == 0x00 && data[0] != 0x00 && data[2] != 0x00 {
+        // No BOM, but looks like UTF-16 LE (e.g. "M\0I\0M\0E\0")
+        (true, 0)
+    } else if data[0] == 0x00 && data[2] == 0x00 && data[1] != 0x00 && data[3] != 0x00 {
+        // No BOM, but looks like UTF-16 BE (e.g. "\0M\0I\0M\0E")
+        (false, 0)
+    } else {
+        return None;
+    };
+    let payload = &data[skip..];
+    let even_len = payload.len() & !1;
+    let u16_iter = (0..even_len).step_by(2).map(|i| {
+        if is_le {
+            u16::from_le_bytes([payload[i], payload[i + 1]])
+        } else {
+            u16::from_be_bytes([payload[i], payload[i + 1]])
+        }
+    });
+    match String::from_utf16(&u16_iter.collect::<Vec<u16>>()) {
+        Ok(s) => Some(s.into_bytes()),
+        Err(_) => None,
+    }
+}
 /// Parse .eml file content (RFC822 format)
 pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
+    // Transcode UTF-16 to UTF-8 if a BOM is detected
+    let data = if let Some(transcoded) = maybe_transcode_utf16(data) {
+        std::borrow::Cow::Owned(transcoded)
+    } else {
+        std::borrow::Cow::Borrowed(data)
+    };
     let message = mail_parser::MessageParser::default()
-        .parse(data)
+        .parse(&data)
         .ok_or_else(|| KreuzbergError::parsing("Failed to parse EML file: invalid email format".to_string()))?;
     let subject = message.subject().map(|s| s.to_string());
@@ -293,14 +344,11 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
     if let Some(ref msg_id) = message_id {
         metadata.insert("message_id".to_string(), msg_id.to_string());
     }
-    if !attachments.is_empty() {
-        let attachment_names: Vec<String> = attachments
-            .iter()
-            .filter_map(|a| a.filename.as_ref())
-            .cloned()
-            .collect();
-        metadata.insert("attachments".to_string(), attachment_names.join(", "));
-    }
+    // NOTE: Do NOT insert "attachments" into the metadata HashMap here.
+    // The attachments are already stored in EmailMetadata.attachments (Vec<String>).
+    // Since both `format` and `additional` use #[serde(flatten)], inserting a
+    // comma-joined string here would overwrite the structured array, breaking
+    // deserialization in Go, C#, and other typed bindings.
     Ok(EmailExtractionResult {
         subject,

data/vendor/kreuzberg/src/extractors/docx.rs CHANGED Viewed

@@ -1,4 +1,4 @@
-#![cfg(all(feature = "tokio-runtime", feature = "office"))]
+#![cfg(feature = "office")]
 //! DOCX extractor for high-performance text extraction.
 //!
@@ -8,7 +8,9 @@ use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::extraction::{cells_to_markdown, office_metadata};
 use crate::plugins::{DocumentExtractor, Plugin};
-use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
+#[cfg(feature = "tokio-runtime")]
+use crate::types::PageBoundary;
+use crate::types::{ExtractionResult, Metadata, PageInfo, PageStructure, PageUnitType, Table};
 use ahash::AHashMap;
 use async_trait::async_trait;
 use std::borrow::Cow;
@@ -112,63 +114,96 @@ impl DocumentExtractor for DocxExtractor {
         mime_type: &str,
         _config: &ExtractionConfig,
     ) -> Result<ExtractionResult> {
-        let (text, tables, page_boundaries) = if crate::core::batch_mode::is_batch_mode() {
-            let content_owned = content.to_vec();
-            let span = tracing::Span::current();
-            tokio::task::spawn_blocking(
-                move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
-                    let _guard = span.entered();
-                    let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
-                    let text = doc.extract_text();
-                    let tables: Vec<Table> = doc
-                        .tables
-                        .iter()
-                        .enumerate()
-                        .map(|(idx, table)| convert_docx_table_to_table(table, idx))
-                        .collect();
-                    let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
+        let (text, tables, page_boundaries) = {
+            #[cfg(feature = "tokio-runtime")]
+            if crate::core::batch_mode::is_batch_mode() {
+                let content_owned = content.to_vec();
+                let span = tracing::Span::current();
+                tokio::task::spawn_blocking(
+                    move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
+                        let _guard = span.entered();
+                        let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
+                        let text = doc.extract_text();
+                        let tables: Vec<Table> = doc
+                            .tables
+                            .iter()
+                            .enumerate()
+                            .map(|(idx, table)| convert_docx_table_to_table(table, idx))
+                            .collect();
+                        let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
+                        Ok((text, tables, page_boundaries))
+                    },
+                )
+                .await
+                .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
+            } else {
+                let doc = crate::extraction::docx::parser::parse_document(content)?;
+                let text = doc.extract_text();
+                let tables: Vec<Table> = doc
+                    .tables
+                    .iter()
+                    .enumerate()
+                    .map(|(idx, table)| convert_docx_table_to_table(table, idx))
+                    .collect();
+                let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
+                (text, tables, page_boundaries)
+            }
-                    Ok((text, tables, page_boundaries))
-                },
-            )
-            .await
-            .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
-        } else {
-            let doc = crate::extraction::docx::parser::parse_document(content)?;
+            #[cfg(not(feature = "tokio-runtime"))]
+            {
+                let doc = crate::extraction::docx::parser::parse_document(content)?;
-            let text = doc.extract_text();
+                let text = doc.extract_text();
-            let tables: Vec<Table> = doc
-                .tables
-                .iter()
-                .enumerate()
-                .map(|(idx, table)| convert_docx_table_to_table(table, idx))
-                .collect();
+                let tables: Vec<Table> = doc
+                    .tables
+                    .iter()
+                    .enumerate()
+                    .map(|(idx, table)| convert_docx_table_to_table(table, idx))
+                    .collect();
-            let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
+                let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
-            (text, tables, page_boundaries)
+                (text, tables, page_boundaries)
+            }
         };
-        let mut archive = if crate::core::batch_mode::is_batch_mode() {
-            let content_owned = content.to_vec();
-            let span = tracing::Span::current();
-            tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
-                let _guard = span.entered();
+        let mut archive = {
+            #[cfg(feature = "tokio-runtime")]
+            if crate::core::batch_mode::is_batch_mode() {
+                let content_owned = content.to_vec();
+                let span = tracing::Span::current();
+                tokio::task::spawn_blocking(move || -> crate::error::Result<_> {
+                    let _guard = span.entered();
+                    let cursor = Cursor::new(content_owned);
+                    zip::ZipArchive::new(cursor).map_err(|e| {
+                        crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e))
+                    })
+                })
+                .await
+                .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
+            } else {
+                let content_owned = content.to_vec();
                 let cursor = Cursor::new(content_owned);
                 zip::ZipArchive::new(cursor)
-                    .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))
-            })
-            .await
-            .map_err(|e| crate::error::KreuzbergError::parsing(format!("Task join error: {}", e)))??
-        } else {
-            let content_owned = content.to_vec();
-            let cursor = Cursor::new(content_owned);
-            zip::ZipArchive::new(cursor)
-                .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
+                    .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
+            }
+            #[cfg(not(feature = "tokio-runtime"))]
+            {
+                let content_owned = content.to_vec();
+                let cursor = Cursor::new(content_owned);
+                zip::ZipArchive::new(cursor)
+                    .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
+            }
         };
         let mut metadata_map = AHashMap::new();

data/vendor/kreuzberg/src/extractors/mod.rs CHANGED Viewed

@@ -91,7 +91,7 @@ pub mod bibtex;
 #[cfg(feature = "office")]
 pub mod citation;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub mod docx;
 #[cfg(feature = "office")]
@@ -115,7 +115,7 @@ pub mod jupyter;
 #[cfg(feature = "office")]
 pub mod orgmode;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub mod odt;
 #[cfg(feature = "office")]
@@ -130,7 +130,7 @@ pub mod jats;
 #[cfg(feature = "pdf")]
 pub mod pdf;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub mod pptx;
 #[cfg(feature = "office")]
@@ -166,7 +166,7 @@ pub use bibtex::BibtexExtractor;
 #[cfg(feature = "office")]
 pub use citation::CitationExtractor;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub use docx::DocxExtractor;
 #[cfg(feature = "office")]
@@ -192,7 +192,7 @@ pub use jupyter::JupyterExtractor;
 #[cfg(feature = "office")]
 pub use orgmode::OrgModeExtractor;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub use odt::OdtExtractor;
 #[cfg(feature = "xml")]
@@ -207,7 +207,7 @@ pub use typst::TypstExtractor;
 #[cfg(feature = "pdf")]
 pub use pdf::PdfExtractor;
-#[cfg(all(feature = "tokio-runtime", feature = "office"))]
+#[cfg(feature = "office")]
 pub use pptx::PptxExtractor;
 #[cfg(feature = "office")]
@@ -312,10 +312,6 @@ pub fn register_default_extractors() -> Result<()> {
         registry.register(Arc::new(OrgModeExtractor::new()))?;
         registry.register(Arc::new(OpmlExtractor::new()))?;
         registry.register(Arc::new(TypstExtractor::new()))?;
-    }
-    #[cfg(all(feature = "tokio-runtime", feature = "office"))]
-    {
         registry.register(Arc::new(DocxExtractor::new()))?;
         registry.register(Arc::new(PptxExtractor::new()))?;
         registry.register(Arc::new(OdtExtractor::new()))?;