RubyGems - kreuzberg - Versions diffs - 4.6.0 → 4.6.1 - Mend

kreuzberg 4.6.0 → 4.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

checksums.yaml +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/ext/kreuzberg_rb/native/src/config/types.rs +18 -0
data/lib/kreuzberg/config.rb +22 -8
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +5 -1
data/vendor/Cargo.toml +3 -3
data/vendor/kreuzberg/Cargo.toml +5 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +1 -0
data/vendor/kreuzberg/src/api/openapi.rs +6 -0
data/vendor/kreuzberg/src/core/config/extraction/core.rs +28 -0
data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +23 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -0
data/vendor/kreuzberg/src/error.rs +3 -0
data/vendor/kreuzberg/src/extraction/mod.rs +6 -0
data/vendor/kreuzberg/src/extraction/pst.rs +386 -0
data/vendor/kreuzberg/src/extraction/structured.rs +214 -1
data/vendor/kreuzberg/src/extraction/transform/content.rs +40 -7
data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +69 -12
data/vendor/kreuzberg/src/extraction/transform/mod.rs +159 -6
data/vendor/kreuzberg/src/extractors/email.rs +3 -3
data/vendor/kreuzberg/src/extractors/mod.rs +12 -2
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +121 -52
data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +536 -194
data/vendor/kreuzberg/src/extractors/pst.rs +264 -0
data/vendor/kreuzberg/src/extractors/structured.rs +14 -4
data/vendor/kreuzberg/src/mcp/errors.rs +5 -0
data/vendor/kreuzberg/src/pdf/layout_runner.rs +214 -226
data/vendor/kreuzberg/src/pdf/text.rs +41 -2
data/vendor/kreuzberg/test_documents/jsonl/simple.jsonl +3 -0
data/vendor/kreuzberg/test_documents/jsonl/with_blanks.jsonl +5 -0
data/vendor/kreuzberg/tests/api_consistency.rs +7 -0
data/vendor/kreuzberg/tests/jsonl_integration.rs +82 -0
data/vendor/kreuzberg/tests/pst_integration.rs +82 -0
data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
data/vendor/kreuzberg-ffi/kreuzberg.h +24 -2
data/vendor/kreuzberg-ffi/src/config/merge.rs +7 -0
data/vendor/kreuzberg-ffi/src/config_builder.rs +37 -0
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +7 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 203e9719bcf3cf2cda1252dcd7a5c5782e7b73936a304b626a351894d4fcd909
-  data.tar.gz: 2c02a45c882ef6b6b6935896e9334c46f012aaf8bc6f6669fa3c0110b67398e5
+  metadata.gz: 7f2441d44e9083d36f9f0e60b2c155624ad251f97c6f2ca629bd3ef33aeeeb46
+  data.tar.gz: b443884a5e4bfffa1bc916928ae97997e1ca21cdfb80e76b845c454c0aa67256
 SHA512:
-  metadata.gz: d3dde81c8c38b1ee99bed3cae32e477e4c8941d401c6449fc9c3eec3608a5b771b47c20ab3a9679ccf75059fed5e6c09f9d91eefed83a7d9dc59eebf7acb5626
-  data.tar.gz: e590247800d9752175985ee3b8ad0c89c5926f1afa0669a881cf476455c8514332880d30ff35d46a2836cf9cdc18b752296fb06a545f42129b548b5675180a71
+  metadata.gz: e238336b3ceae6d2bed4bd530d393f32473a7a88f68f7e28a6300ffe2f7cadc5aee0ac834fa5712223810f9cf1f7a68a363b7e96079060565e1c9e10af9f8114
+  data.tar.gz: 48eae2f77c78ef1b9794ac3dde62c785b32ab99253b6ab7ce8da9d8bba8a57f209fb5fc6353a0bc8f5771bd825fc3b3b29652149133bd0ba1dc29e58101185bb

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.1" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-rb"
-version = "4.6.0"
+version = "4.6.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED Viewed

@@ -875,6 +875,17 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
             config.force_ocr = bool::try_convert(val)?;
         }
+        if let Some(val) = get_kw(ruby, hash, "force_ocr_pages")
+            && val.equal(ruby.qnil()).ok() != Some(true)
+        {
+            let pages_array = magnus::RArray::try_convert(val)?;
+            let pages: Vec<usize> = pages_array
+                .into_iter()
+                .map(|v| usize::try_convert(v))
+                .collect::<Result<Vec<_>, _>>()?;
+            config.force_ocr_pages = Some(pages);
+        }
         if let Some(val) = get_kw(ruby, hash, "include_document_structure") {
             config.include_document_structure = bool::try_convert(val)?;
         }
@@ -1013,6 +1024,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
                 }
             };
         }
+        if let Some(val) = get_kw(ruby, hash, "extraction_timeout_secs")
+            && val.equal(ruby.qnil()).ok() != Some(true)
+        {
+            let secs = u64::try_convert(val)?;
+            config.extraction_timeout_secs = Some(secs);
+        }
     }
     Ok(config)

data/lib/kreuzberg/config.rb CHANGED Viewed

@@ -926,14 +926,14 @@ module Kreuzberg
     #   )
     #
     class Extraction
-      attr_reader :use_cache, :enable_quality_processing, :force_ocr,
+      attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
                   :include_document_structure,
                   :ocr, :chunking, :language_detection, :pdf_options,
                   :images, :postprocessor,
                   :token_reduction, :keywords, :html_options, :pages,
                   :max_concurrent_extractions, :output_format, :result_format,
                   :security_limits, :layout, :concurrency,
-                  :cache_namespace, :cache_ttl_secs
+                  :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
       # Alias for backward compatibility - image_extraction is the canonical name
       alias image_extraction images
@@ -954,11 +954,11 @@ module Kreuzberg
       #
       # Keys that are allowed in the Extraction config
       ALLOWED_KEYS = %i[
-        use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
+        use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
         language_detection pdf_options image_extraction
         postprocessor token_reduction keywords html_options pages
         max_concurrent_extractions output_format result_format
-        security_limits layout concurrency cache_namespace cache_ttl_secs
+        security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
       ].freeze
       # Aliases for backward compatibility
@@ -1019,6 +1019,7 @@ module Kreuzberg
                      use_cache: true,
                      enable_quality_processing: true,
                      force_ocr: false,
+                     force_ocr_pages: nil,
                      include_document_structure: false,
                      ocr: nil,
                      chunking: nil,
@@ -1037,10 +1038,12 @@ module Kreuzberg
                      layout: nil,
                      concurrency: nil,
                      cache_namespace: nil,
-                     cache_ttl_secs: nil)
+                     cache_ttl_secs: nil,
+                     extraction_timeout_secs: nil)
         kwargs = {
           use_cache: use_cache, enable_quality_processing: enable_quality_processing,
-          force_ocr: force_ocr, include_document_structure: include_document_structure,
+          force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
+          include_document_structure: include_document_structure,
           ocr: ocr, chunking: chunking, language_detection: language_detection,
           pdf_options: pdf_options, image_extraction: image_extraction,
           postprocessor: postprocessor,
@@ -1050,7 +1053,8 @@ module Kreuzberg
           security_limits: security_limits, layout: layout,
           concurrency: concurrency,
           cache_namespace: cache_namespace,
-          cache_ttl_secs: cache_ttl_secs
+          cache_ttl_secs: cache_ttl_secs,
+          extraction_timeout_secs: extraction_timeout_secs
         }
         extracted = extract_from_hash(hash, kwargs)
@@ -1068,6 +1072,7 @@ module Kreuzberg
         @use_cache = params[:use_cache] ? true : false
         @enable_quality_processing = params[:enable_quality_processing] ? true : false
         @force_ocr = params[:force_ocr] ? true : false
+        @force_ocr_pages = params[:force_ocr_pages]
         @include_document_structure = params[:include_document_structure] ? true : false
         @ocr = normalize_config(params[:ocr], OCR)
         @chunking = normalize_config(params[:chunking], Chunking)
@@ -1086,6 +1091,7 @@ module Kreuzberg
         @result_format = validate_result_format(params[:result_format])
         @cache_namespace = params[:cache_namespace]
         @cache_ttl_secs = params[:cache_ttl_secs]&.to_i
+        @extraction_timeout_secs = params[:extraction_timeout_secs]&.to_i
         @security_limits = params[:security_limits]
       end
@@ -1118,12 +1124,14 @@ module Kreuzberg
           use_cache: @use_cache,
           enable_quality_processing: @enable_quality_processing,
           force_ocr: @force_ocr,
+          force_ocr_pages: @force_ocr_pages,
           include_document_structure: @include_document_structure,
           max_concurrent_extractions: @max_concurrent_extractions,
           output_format: @output_format,
           result_format: @result_format,
           cache_namespace: @cache_namespace,
-          cache_ttl_secs: @cache_ttl_secs
+          cache_ttl_secs: @cache_ttl_secs,
+          extraction_timeout_secs: @extraction_timeout_secs
         }
       end
@@ -1250,6 +1258,8 @@ module Kreuzberg
           @enable_quality_processing = value ? true : false
         when :force_ocr
           @force_ocr = value ? true : false
+        when :force_ocr_pages
+          @force_ocr_pages = value
         when :include_document_structure
           @include_document_structure = value ? true : false
         when :ocr
@@ -1286,6 +1296,8 @@ module Kreuzberg
           @cache_namespace = value
         when :cache_ttl_secs
           @cache_ttl_secs = value&.to_i
+        when :extraction_timeout_secs
+          @extraction_timeout_secs = value&.to_i
         else
           raise ArgumentError, "Unknown configuration key: #{key}"
         end
@@ -1345,6 +1357,7 @@ module Kreuzberg
         @use_cache = merged.use_cache
         @enable_quality_processing = merged.enable_quality_processing
         @force_ocr = merged.force_ocr
+        @force_ocr_pages = merged.force_ocr_pages
         @include_document_structure = merged.include_document_structure
         @ocr = merged.ocr
         @chunking = merged.chunking
@@ -1369,6 +1382,7 @@ module Kreuzberg
         @result_format = merged.result_format
         @cache_namespace = merged.cache_namespace
         @cache_ttl_secs = merged.cache_ttl_secs
+        @extraction_timeout_secs = merged.extraction_timeout_secs
       end
     end
   end

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.6.0'
+  VERSION = '4.6.1'
 end

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -481,7 +481,9 @@ module Kreuzberg
       attr_reader enable_quality_processing: bool
       attr_reader cache_namespace: String?
       attr_reader cache_ttl_secs: Integer?
+      attr_reader extraction_timeout_secs: Integer?
       attr_reader force_ocr: bool
+      attr_reader force_ocr_pages: Array[Integer]?
       attr_reader include_document_structure: bool
       attr_reader ocr: OCR?
       attr_reader chunking: Chunking?
@@ -508,6 +510,7 @@ module Kreuzberg
         ?use_cache: bool,
         ?enable_quality_processing: bool,
         ?force_ocr: bool,
+        ?force_ocr_pages: Array[Integer]?,
         ?include_document_structure: bool,
         ?ocr: (OCR | Hash[Symbol, untyped])?,
         ?chunking: (Chunking | Hash[Symbol, untyped])?,
@@ -525,7 +528,8 @@ module Kreuzberg
         ?output_format: String?,
         ?result_format: String?,
         ?cache_namespace: String?,
-        ?cache_ttl_secs: Integer?
+        ?cache_ttl_secs: Integer?,
+        ?extraction_timeout_secs: Integer?
       ) -> void
       def to_h: () -> Hash[Symbol, untyped]
       def to_json: (*untyped) -> String

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.6.0"
+version = "4.6.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
 image = { version = "0.25.10", default-features = false }
 itertools = "0.14"
 js-sys = "0.3"
-kreuzberg = { path = "./crates/kreuzberg", version = "4.6.0", default-features = false }
-kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.0" }
+kreuzberg = { path = "./crates/kreuzberg", version = "4.6.1", default-features = false }
+kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.1" }
 lazy_static = "1.5.0"
 libc = "0.2.183"
 log = "0.4"

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.6.0"
+version = "4.6.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -53,7 +53,7 @@ office = [
 ]
 hwp = ["dep:cfb", "dep:flate2"]
 iwork = ["dep:zip", "dep:snap"]
-email = ["dep:mail-parser", "dep:cfb"]
+email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
 html = ["dep:html-to-markdown-rs"]
 xml = ["dep:quick-xml", "dep:roxmltree"]
 archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
@@ -238,6 +238,7 @@ bytes = { version = "1", features = ["serde"] }
 calamine = { version = "0.34.0", features = ["dates"], optional = true }
 cfb = { version = "0.14", optional = true }
 chardetng = { version = "0.1.17", optional = true }
+chrono = { version = "0.4", optional = true }
 dashmap = "6.1"
 dbase = { version = "0.7", optional = true }
 encoding_rs = { version = "0.8.35" }
@@ -287,6 +288,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
     "ndarray",
     "api-18",
 ], optional = true }
+outlook-pst = { version = "1.2.0", optional = true }
 parking_lot = "0.12.5"
 pastey = "0.2"
 pdf_oxide = { version = "0.3.17", default-features = false, optional = true }
@@ -315,6 +317,7 @@ sha2 = { version = "0.10", optional = true }
 simdutf8 = { version = "0.1", optional = true }
 snap = { version = "1.1", optional = true }
 tar = { version = "0.4.45", optional = true }
+tempfile = { version = "3.27.0", optional = true }
 text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
 thiserror = "2.0.18"
 tiff = { version = "0.11", optional = true }

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.6.0 Release**
+> **🚀 Version 4.6.1 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/api/error.rs CHANGED Viewed

@@ -115,6 +115,7 @@ impl ApiError {
             KreuzbergError::Plugin { .. } => "PluginError",
             KreuzbergError::LockPoisoned(_) => "LockPoisonedError",
             KreuzbergError::UnsupportedFormat(_) => "UnsupportedFormatError",
+            KreuzbergError::Timeout { .. } => "TimeoutError",
             KreuzbergError::Other(_) => "Error",
         };

data/vendor/kreuzberg/src/api/openapi.rs CHANGED Viewed

@@ -74,6 +74,12 @@ use utoipa::OpenApi;
             crate::types::extraction::ElementId,
             crate::types::extraction::ElementType,
             crate::types::extraction::BoundingBox,
+            crate::types::ocr_elements::OcrElement,
+            crate::types::ocr_elements::OcrBoundingGeometry,
+            crate::types::ocr_elements::OcrConfidence,
+            crate::types::ocr_elements::OcrRotation,
+            crate::types::ocr_elements::OcrElementLevel,
+            crate::types::ocr_elements::OcrElementConfig,
             crate::types::metadata::Metadata,
             crate::types::tables::Table,
             crate::types::page::PageContent,

data/vendor/kreuzberg/src/core/config/extraction/core.rs CHANGED Viewed

@@ -47,6 +47,16 @@ pub struct ExtractionConfig {
     #[serde(default)]
     pub force_ocr: bool,
+    /// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
+    ///
+    /// When set, only the listed pages are OCR'd regardless of text layer quality.
+    /// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
+    /// Only applies to PDF documents. Duplicates are automatically deduplicated.
+    /// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub force_ocr_pages: Option<Vec<usize>>,
     /// Text chunking configuration (None = chunking disabled)
     #[serde(default)]
     pub chunking: Option<ChunkingConfig>,
@@ -89,6 +99,14 @@ pub struct ExtractionConfig {
     #[serde(default)]
     pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
+    /// Default per-file timeout in seconds for batch extraction.
+    ///
+    /// When set, each file in a batch will be canceled after this duration
+    /// unless overridden by [`FileExtractionConfig::timeout_secs`].
+    /// `None` means no timeout (unbounded extraction time).
+    #[serde(default)]
+    pub extraction_timeout_secs: Option<u64>,
     /// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
     ///
     /// Limits parallelism to prevent resource exhaustion when processing
@@ -201,6 +219,7 @@ impl Default for ExtractionConfig {
             enable_quality_processing: true,
             ocr: None,
             force_ocr: false,
+            force_ocr_pages: None,
             chunking: None,
             images: None,
             #[cfg(feature = "pdf")]
@@ -213,6 +232,7 @@ impl Default for ExtractionConfig {
             postprocessor: None,
             #[cfg(feature = "html")]
             html_options: None,
+            extraction_timeout_secs: None,
             max_concurrent_extractions: None,
             #[cfg(feature = "archives")]
             security_limits: None,
@@ -259,6 +279,7 @@ impl ExtractionConfig {
             ref enable_quality_processing,
             ref ocr,
             ref force_ocr,
+            ref force_ocr_pages,
             ref chunking,
             ref images,
             #[cfg(feature = "pdf")]
@@ -276,6 +297,7 @@ impl ExtractionConfig {
             ref include_document_structure,
             #[cfg(feature = "layout-detection")]
             ref layout,
+            ref timeout_secs,
         } = *overrides;
         let mut config = self.clone();
@@ -289,6 +311,9 @@ impl ExtractionConfig {
         if let Some(v) = force_ocr {
             config.force_ocr = *v;
         }
+        if let Some(v) = force_ocr_pages {
+            config.force_ocr_pages = Some(v.clone());
+        }
         if let Some(v) = chunking {
             config.chunking = Some(v.clone());
         }
@@ -332,6 +357,9 @@ impl ExtractionConfig {
         if let Some(v) = layout {
             config.layout = Some(v.clone());
         }
+        if let Some(v) = timeout_secs {
+            config.extraction_timeout_secs = Some(*v);
+        }
         config
     }

data/vendor/kreuzberg/src/core/config/extraction/file_config.rs CHANGED Viewed

@@ -57,6 +57,10 @@ pub struct FileExtractionConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub force_ocr: Option<bool>,
+    /// Override force OCR pages for this file (1-indexed page numbers).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub force_ocr_pages: Option<Vec<usize>>,
     /// Override chunking configuration for this file.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub chunking: Option<ChunkingConfig>,
@@ -112,4 +116,12 @@ pub struct FileExtractionConfig {
     #[cfg(feature = "layout-detection")]
     #[serde(skip_serializing_if = "Option::is_none")]
     pub layout: Option<super::super::layout::LayoutDetectionConfig>,
+    /// Override per-file extraction timeout in seconds.
+    ///
+    /// When set, the extraction for this file will be canceled after the
+    /// specified duration. A timed-out file produces an error result without
+    /// affecting other files in the batch.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeout_secs: Option<u64>,
 }

data/vendor/kreuzberg/src/core/extractor/batch.rs CHANGED Viewed

@@ -63,11 +63,12 @@ where
     Ok(results.into_iter().map(|r| r.unwrap()).collect())
 }
-/// Run a single extraction task with semaphore gating, timing, and batch mode.
+/// Run a single extraction task with semaphore gating, timing, optional timeout, and batch mode.
 #[cfg(feature = "tokio-runtime")]
 async fn run_timed_extraction<F, Fut>(
     index: usize,
     semaphore: Arc<tokio::sync::Semaphore>,
+    timeout_secs: Option<u64>,
     extract_fn: F,
 ) -> (usize, Result<ExtractionResult>, u64)
 where
@@ -76,7 +77,23 @@ where
 {
     let _permit = semaphore.acquire().await.unwrap();
     let start = Instant::now();
-    let mut result = crate::core::batch_mode::with_batch_mode(extract_fn()).await;
+    let extraction_future = crate::core::batch_mode::with_batch_mode(extract_fn());
+    let mut result = match timeout_secs {
+        Some(secs) => match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
+            Ok(inner) => inner,
+            Err(_elapsed) => {
+                let elapsed_ms = start.elapsed().as_millis() as u64;
+                Err(KreuzbergError::Timeout {
+                    elapsed_ms,
+                    limit_ms: secs * 1000,
+                })
+            }
+        },
+        None => extraction_future.await,
+    };
     let elapsed_ms = start.elapsed().as_millis() as u64;
     if let Ok(ref mut r) = result {
@@ -182,7 +199,8 @@ pub async fn batch_extract_file(
         async move {
             let (ref path, ref file_config) = items[index];
             let resolved = resolve_config(&cfg, file_config);
-            run_timed_extraction(index, sem, || {
+            let timeout = resolved.extraction_timeout_secs;
+            run_timed_extraction(index, sem, timeout, || {
                 let path = path.clone();
                 async move { extract_file(&path, None, &resolved).await }
             })
@@ -282,7 +300,8 @@ pub async fn batch_extract_bytes(
         async move {
             let (bytes, mime_type, file_config) = slots[index].lock().take().expect("batch item already consumed");
             let resolved = resolve_config(&cfg, &file_config);
-            run_timed_extraction(index, sem, || async move {
+            let timeout = resolved.extraction_timeout_secs;
+            run_timed_extraction(index, sem, timeout, || async move {
                 extract_bytes(&bytes, &mime_type, &resolved).await
             })
             .await

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -35,7 +35,9 @@ pub const LEGACY_POWERPOINT_MIME_TYPE: &str = "application/vnd.ms-powerpoint";
 pub const EML_MIME_TYPE: &str = "message/rfc822";
 pub const MSG_MIME_TYPE: &str = "application/vnd.ms-outlook";
+pub const PST_MIME_TYPE: &str = "application/vnd.ms-outlook-pst";
 pub const JSON_MIME_TYPE: &str = "application/json";
+pub const JSONL_MIME_TYPE: &str = "application/x-ndjson";
 pub const YAML_MIME_TYPE: &str = "application/x-yaml";
 pub const TOML_MIME_TYPE: &str = "application/toml";
 pub const XML_MIME_TYPE: &str = "application/xml";
@@ -368,6 +370,11 @@ static FORMATS: &[FormatEntry] = &[
         mime_type: "application/csl+json",
         aliases: &[],
     },
+    FormatEntry {
+        extensions: &["jsonl", "ndjson"],
+        mime_type: "application/x-ndjson",
+        aliases: &["application/jsonl", "application/x-jsonlines"],
+    },
     FormatEntry {
         extensions: &["yaml", "yml"],
         mime_type: "application/x-yaml",
@@ -399,6 +406,11 @@ static FORMATS: &[FormatEntry] = &[
         mime_type: "application/vnd.ms-outlook",
         aliases: &[],
     },
+    FormatEntry {
+        extensions: &["pst"],
+        mime_type: "application/vnd.ms-outlook-pst",
+        aliases: &[],
+    },
     // ── Archives ────────────────────────────────────────────────────────
     FormatEntry {
         extensions: &["zip"],

data/vendor/kreuzberg/src/error.rs CHANGED Viewed

@@ -124,6 +124,9 @@ pub enum KreuzbergError {
     #[error("Unsupported format: {0}")]
     UnsupportedFormat(String),
+    #[error("Extraction timed out after {elapsed_ms}ms (limit: {limit_ms}ms)")]
+    Timeout { elapsed_ms: u64, limit_ms: u64 },
     #[error("{0}")]
     Other(String),
 }

data/vendor/kreuzberg/src/extraction/mod.rs CHANGED Viewed

@@ -22,6 +22,9 @@ pub mod archive;
 #[cfg(feature = "email")]
 pub mod email;
+#[cfg(feature = "email")]
+pub mod pst;
 #[cfg(any(feature = "excel", feature = "excel-wasm"))]
 pub mod excel;
@@ -77,6 +80,9 @@ pub use archive::{
 #[cfg(feature = "email")]
 pub use email::{build_email_text_output, extract_email_content, parse_eml_content, parse_msg_content};
+#[cfg(feature = "email")]
+pub use pst::extract_pst_messages;
 #[cfg(any(feature = "excel", feature = "excel-wasm"))]
 pub use excel::{excel_to_markdown, read_excel_bytes, read_excel_file};