kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
data/vendor/kreuzberg/src/api/types.rs:

@@ -109,19 +109,41 @@ impl ApiSizeLimits {
     }
 }
 
+/// Plugin status information in health response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+pub struct PluginStatus {
+    /// Number of registered OCR backends
+    pub ocr_backends_count: usize,
+    /// Names of registered OCR backends
+    pub ocr_backends: Vec<String>,
+    /// Number of registered document extractors
+    pub extractors_count: usize,
+    /// Number of registered post-processors
+    pub post_processors_count: usize,
+}
+
 /// Health check response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HealthResponse {
     /// Health status
+    #[cfg_attr(feature = "api", schema(example = "healthy"))]
     pub status: String,
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
+    /// Plugin status (optional)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub plugins: Option<PluginStatus>,
 }
 
 /// Server information response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct InfoResponse {
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
     /// Whether using Rust backend
     pub rust_backend: bool,

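The new optional `plugins` field is guarded by `#[serde(skip_serializing_if = "Option::is_none")]`, so health responses without plugin data keep their previous shape. A minimal sketch of that behavior, using a simplified stand-in for `PluginStatus` (this example is illustrative, not part of the diff):

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct HealthResponse {
    status: String,
    version: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    plugins: Option<Vec<String>>, // simplified stand-in for PluginStatus
}

fn main() {
    let r = HealthResponse {
        status: "healthy".into(),
        version: "0.8.0".into(),
        plugins: None,
    };
    // Prints {"status":"healthy","version":"0.8.0"} with no "plugins" key,
    // matching what pre-4.2.x clients already expect.
    println!("{}", serde_json::to_string(&r).unwrap());
}
```
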
@@ -132,15 +154,19 @@ pub type ExtractResponse = Vec<ExtractionResult>;
 
 /// Error response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ErrorResponse {
     /// Error type name
+    #[cfg_attr(feature = "api", schema(example = "ValidationError"))]
     pub error_type: String,
     /// Error message
+    #[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
     pub message: String,
     /// Stack trace (if available)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub traceback: Option<String>,
     /// HTTP status code
+    #[cfg_attr(feature = "api", schema(example = 400))]
     pub status_code: u16,
 }
 

@@ -156,8 +182,10 @@ pub struct ApiState {
 
 /// Cache statistics response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheStatsResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Total number of cache files
     pub total_files: usize,

@@ -173,8 +201,10 @@ pub struct CacheStatsResponse {
 
 /// Cache clear response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheClearResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Number of files removed
     pub removed_files: usize,

@@ -184,20 +214,25 @@ pub struct CacheClearResponse {
 
 /// Embedding request for generating embeddings from text.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedRequest {
-    /// Text strings to generate embeddings for
+    /// Text strings to generate embeddings for (at least one non-empty string required)
+    #[cfg_attr(feature = "api", schema(min_items = 1))]
     pub texts: Vec<String>,
     /// Optional embedding configuration (model, batch size, etc.)
     #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
     pub config: Option<crate::core::config::EmbeddingConfig>,
 }
 
 /// Embedding response containing generated embeddings.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedResponse {
     /// Generated embeddings (one per input text)
     pub embeddings: Vec<Vec<f32>>,
     /// Model used for embedding generation
+    #[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
     pub model: String,
     /// Dimensionality of the embeddings
     pub dimensions: usize,

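Taken together, the annotations describe an embed request like the sketch below. The payload is hypothetical: the `batch_size` key is assumed from the `default_batch_size` helper referenced later in this diff, and `EmbeddingConfig` accepts other fields not shown here.

```rust
// Hypothetical request body consistent with EmbedRequest above:
// "texts" must contain at least one entry (schema(min_items = 1)),
// and "config" may be omitted entirely (skip_serializing_if).
fn main() {
    let body = serde_json::json!({
        "texts": ["Hello, world"],
        "config": { "batch_size": 32 } // key assumed from default_batch_size()
    });
    println!("{body}");
}
```
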
@@ -212,23 +247,29 @@ fn default_chunker_type() -> String {
 
 /// Chunk request with text and configuration.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkRequest {
-    /// Text to chunk
+    /// Text to chunk (must not be empty)
+    #[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
     pub text: String,
     /// Optional chunking configuration
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<ChunkingConfigRequest>,
     /// Chunker type (text or markdown)
     #[serde(default = "default_chunker_type")]
+    #[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
     pub chunker_type: String,
 }
 
 /// Chunking configuration request.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigRequest {
-    /// Maximum characters per chunk
+    /// Maximum characters per chunk (must be greater than overlap, default: 2000)
+    #[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
     pub max_characters: Option<usize>,
-    /// Overlap between chunks in characters
+    /// Overlap between chunks in characters (must be less than max_characters, default: 100)
+    #[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
     pub overlap: Option<usize>,
     /// Whether to trim whitespace
     pub trim: Option<bool>,

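The new doc comments state an invariant that the per-field schema bounds can only approximate: `overlap` must stay strictly below `max_characters`. A sketch of the pairwise check, assuming the defaults named in the docs (the crate's actual validation code is not shown in this diff):

```rust
// Illustrative pairwise check for the documented invariant, using the
// defaults from the doc comments above (2000 and 100).
fn validate_chunking(max_characters: Option<usize>, overlap: Option<usize>) -> Result<(), String> {
    let max = max_characters.unwrap_or(2000);
    let ov = overlap.unwrap_or(100);
    if ov >= max {
        return Err(format!("overlap ({ov}) must be less than max_characters ({max})"));
    }
    Ok(())
}

fn main() {
    assert!(validate_chunking(None, None).is_ok());
    assert!(validate_chunking(Some(100), Some(100)).is_err());
}
```
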
@@ -236,6 +277,7 @@ pub struct ChunkingConfigRequest {
 
 /// Chunk response with chunks and metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkResponse {
     /// List of chunks
     pub chunks: Vec<ChunkItem>,

@@ -246,11 +288,13 @@ pub struct ChunkResponse {
     /// Input text size in bytes
     pub input_size_bytes: usize,
     /// Chunker type used for chunking
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }
 
 /// Individual chunk item with metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkItem {
     /// Chunk content
     pub content: String,

@@ -272,6 +316,7 @@ pub struct ChunkItem {
 
 /// Chunking configuration response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigResponse {
     /// Maximum characters per chunk
     pub max_characters: usize,

@@ -280,5 +325,6 @@ pub struct ChunkingConfigResponse {
     /// Whether whitespace was trimmed
     pub trim: bool,
     /// Type of chunker used
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }

data/vendor/kreuzberg/src/core/config/processing.rs:

@@ -84,7 +84,8 @@ pub struct ChunkingConfig {
 /// Requires the `embeddings` feature to be enabled.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct EmbeddingConfig {
-    /// The embedding model to use
+    /// The embedding model to use (defaults to "balanced" preset if not specified)
+    #[serde(default = "default_model")]
     pub model: EmbeddingModelType,
 
     /// Whether to normalize embedding vectors (recommended for cosine similarity)

@@ -156,6 +157,12 @@ fn default_batch_size() -> usize {
     32
 }
 
+fn default_model() -> EmbeddingModelType {
+    EmbeddingModelType::Preset {
+        name: "balanced".to_string(),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

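With `#[serde(default = "default_model")]`, configs that omit `model` now deserialize to the "balanced" preset instead of failing. A self-contained sketch of the mechanism; `ModelType` here is a stand-in, since `EmbeddingModelType`'s exact serde representation is not shown in this diff:

```rust
use serde::Deserialize;

// Stand-in for kreuzberg's EmbeddingModelType (tag layout assumed).
#[derive(Debug, Deserialize, PartialEq)]
#[serde(tag = "type")]
enum ModelType {
    Preset { name: String },
}

fn default_model() -> ModelType {
    ModelType::Preset { name: "balanced".to_string() }
}

#[derive(Debug, Deserialize)]
struct EmbeddingConfig {
    // Called only when the field is absent from the input.
    #[serde(default = "default_model")]
    model: ModelType,
}

fn main() {
    // `model` may now be omitted from the payload entirely.
    let cfg: EmbeddingConfig = serde_json::from_str("{}").unwrap();
    assert_eq!(cfg.model, ModelType::Preset { name: "balanced".into() });
}
```
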
data/vendor/kreuzberg/src/core/config_validation/sections.rs:

@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 /// Valid tesseract OEM (OCR Engine Mode) values.
 const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
 
-/// Valid output formats for
-
+/// Valid output formats for document extraction.
+/// Supports plain text, markdown, djot, and HTML output formats.
+/// Also accepts aliases: "text" for "plain", "md" for "markdown".
+const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
 
 /// Validate a binarization method string.
 ///

@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
     }
 }
 
-/// Validate a
+/// Validate a document extraction output format.
+///
+/// Accepts the following formats and aliases:
+/// - "plain" or "text" for plain text output
+/// - "markdown" or "md" for Markdown output
+/// - "djot" for Djot markup format
+/// - "html" for HTML output
 ///
 /// # Arguments
 ///
-/// * `format` - The output format to validate
+/// * `format` - The output format to validate
 ///
 /// # Returns
 ///

@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
 /// use kreuzberg::core::config_validation::validate_output_format;
 ///
 /// assert!(validate_output_format("text").is_ok());
+/// assert!(validate_output_format("plain").is_ok());
 /// assert!(validate_output_format("markdown").is_ok());
+/// assert!(validate_output_format("md").is_ok());
+/// assert!(validate_output_format("djot").is_ok());
+/// assert!(validate_output_format("html").is_ok());
 /// assert!(validate_output_format("json").is_err());
 /// ```
 pub fn validate_output_format(format: &str) -> Result<()> {

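The body of `validate_output_format` falls outside this hunk, but given `VALID_OUTPUT_FORMATS` above, a simple membership check would satisfy the doctest. Sketch only; the real function returns the crate's `Result` with a `KreuzbergError`, not a `String`:

```rust
// Plausible validator over the allowlist shown earlier in this diff.
const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];

fn validate_output_format(format: &str) -> Result<(), String> {
    if VALID_OUTPUT_FORMATS.contains(&format) {
        Ok(())
    } else {
        Err(format!("invalid output format {format:?}, expected one of {VALID_OUTPUT_FORMATS:?}"))
    }
}

fn main() {
    assert!(validate_output_format("djot").is_ok());
    assert!(validate_output_format("json").is_err());
}
```
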
data/vendor/kreuzberg/src/core/extractor/file.rs:

@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
-/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
 ///
 /// # Example
 ///

data/vendor/kreuzberg/src/core/extractor/mod.rs:

@@ -411,7 +411,8 @@ mod tests {
 
         assert!(result.is_err());
         use crate::KreuzbergError;
-
+        // File validation returns Io error, not Validation error
+        assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
     }
 
     #[test]

data/vendor/kreuzberg/src/core/io.rs:

@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if file doesn't exist.
 pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
     if !file_exists(&path) {
-        return Err(KreuzbergError::
-
-            path.as_ref().display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.as_ref().display()),
         )));
     }
     Ok(())

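The `io.rs` call sites now build a `std::io::Error` with an explicit `ErrorKind` and convert it via `From`, which implies a `From<std::io::Error>` impl on `KreuzbergError`. A compact stand-in showing the pattern end to end (the real error type has more variants than this):

```rust
use std::path::Path;

// Stand-in for the real error type; the From impl is implied by the
// KreuzbergError::from(std::io::Error::new(..)) calls in this diff.
#[derive(Debug)]
enum KreuzbergError {
    Io { source: std::io::Error },
}

impl From<std::io::Error> for KreuzbergError {
    fn from(source: std::io::Error) -> Self {
        KreuzbergError::Io { source }
    }
}

fn validate_file_exists(path: &Path) -> Result<(), KreuzbergError> {
    if !path.exists() {
        return Err(KreuzbergError::from(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("File does not exist: {}", path.display()),
        )));
    }
    Ok(())
}

fn main() {
    let err = validate_file_exists(Path::new("/no/such/file")).unwrap_err();
    // Callers can now match on the Io variant, as the updated test does.
    assert!(matches!(err, KreuzbergError::Io { .. }));
}
```
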
@@ -99,9 +99,9 @@ where
     let mut files = Vec::new();
 
     if !dir.is_dir() {
-        return Err(KreuzbergError::
-
-            dir.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotADirectory,
+            format!("Path is not a directory: {}", dir.display()),
         )));
     }
 

data/vendor/kreuzberg/src/core/mime.rs:

@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
 pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
     let path = path.as_ref();
 
     if check_exists && !path.exists() {
-        return Err(KreuzbergError::
-
-            path.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.display()),
         )));
     }
 

data/vendor/kreuzberg/src/extraction/excel.rs:

@@ -27,16 +27,23 @@
 //! # Ok(())
 //! # }
 //! ```
-use calamine::{Data, Range, Reader, open_workbook_auto};
+use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
 use std::collections::HashMap;
 use std::fmt::Write as FmtWrite;
-use std::io::Cursor;
+use std::io::{Cursor, Read, Seek};
 use std::path::Path;
 
 use crate::error::{KreuzbergError, Result};
 use crate::extraction::capacity;
 use crate::types::{ExcelSheet, ExcelWorkbook};
 
+/// Maximum number of cells in a Range's bounding box before we consider it pathological.
+/// This threshold is set to prevent OOM when processing files with sparse data at extreme
+/// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
+///
+/// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
+const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
+
 #[cfg(feature = "office")]
 use crate::extraction::office_metadata::{
     extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,

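For scale: a sheet with cells at both A1 and XFD1048575 (Excel's last addressable cell) spans 16,384 columns by 1,048,576 rows. A quick check of the guard's arithmetic:

```rust
// Worked example of the bounding-box guard: the full Excel grid is
// ~17.2 billion cells, far past MAX_BOUNDING_BOX_CELLS.
fn main() {
    const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
    let bb_cells: u64 = 16_384u64.saturating_mul(1_048_576);
    assert_eq!(bb_cells, 17_179_869_184);
    assert!(bb_cells > MAX_BOUNDING_BOX_CELLS); // triggers the sparse path
}
```
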
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
 use serde_json::Value;
 
 pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
+    let lower_path = file_path.to_lowercase();
+
     #[cfg(feature = "office")]
-    let office_metadata = if
-        ||
-        ||
-        ||
+    let office_metadata = if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
     {
         extract_xlsx_office_metadata_from_file(file_path).ok()
     } else {

@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
     #[cfg(not(feature = "office"))]
     let office_metadata: Option<HashMap<String, String>> = None;
 
-    //
+    // For XLSX files, use specialized handler with OOM protection
+    if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
+    {
+        let file = std::fs::File::open(file_path)?;
+        let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
+        return process_xlsx_workbook(workbook, office_metadata);
+    }
+
+    // For other formats, use open_workbook_auto
     let workbook = match open_workbook_auto(Path::new(file_path)) {
         Ok(wb) => wb,
         Err(calamine::Error::Io(io_err)) => {

@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbook> {
         ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
             let workbook = calamine::Xlsx::new(cursor)
                 .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
-
+            process_xlsx_workbook(workbook, office_metadata)
         }
         ".xls" | ".xla" => {
             let workbook = calamine::Xls::new(cursor)

@@ -118,6 +139,194 @@
     }
 }
 
+/// Process XLSX workbooks with special handling for pathological sparse files.
+///
+/// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
+/// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
+/// OOM when processing files like Excel Solver files that have cells at both A1 and
+/// XFD1048575, creating a bounding box of ~17 billion cells.
+fn process_xlsx_workbook<RS: Read + Seek>(
+    mut workbook: calamine::Xlsx<RS>,
+    office_metadata: Option<HashMap<String, String>>,
+) -> Result<ExcelWorkbook> {
+    let sheet_names = workbook.sheet_names();
+    let mut sheets = Vec::with_capacity(sheet_names.len());
+
+    for name in &sheet_names {
+        // Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
+        match process_xlsx_sheet_safe(&mut workbook, name) {
+            Ok(sheet) => sheets.push(sheet),
+            Err(e) => {
+                // Log but don't fail - continue with other sheets
+                tracing::warn!("Failed to process sheet '{}': {}", name, e);
+            }
+        }
+    }
+
+    let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
+    Ok(ExcelWorkbook { sheets, metadata })
+}
+
+/// Process a single XLSX sheet safely by pre-checking the bounding box.
+///
+/// This function streams cells to compute the actual bounding box without allocating
+/// a full Range, then only creates the Range if the bounding box is within safe limits.
+fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
+    // First pass: stream cells to compute actual bounding box and collect cell data
+    let (cells, row_min, row_max, col_min, col_max) = {
+        let mut cell_reader = workbook
+            .worksheet_cells_reader(sheet_name)
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
+
+        let mut cells: Vec<((u32, u32), Data)> = Vec::new();
+        let mut row_min = u32::MAX;
+        let mut row_max = 0u32;
+        let mut col_min = u32::MAX;
+        let mut col_max = 0u32;
+
+        // Stream through all cells, tracking bounds
+        while let Ok(Some(cell)) = cell_reader.next_cell() {
+            let (row, col) = cell.get_position();
+            row_min = row_min.min(row);
+            row_max = row_max.max(row);
+            col_min = col_min.min(col);
+            col_max = col_max.max(col);
+
+            // Convert DataRef to owned Data
+            let data: Data = match cell.get_value() {
+                DataRef::Empty => Data::Empty,
+                DataRef::String(s) => Data::String(s.clone()),
+                DataRef::SharedString(s) => Data::String(s.to_string()),
+                DataRef::Float(f) => Data::Float(*f),
+                DataRef::Int(i) => Data::Int(*i),
+                DataRef::Bool(b) => Data::Bool(*b),
+                DataRef::DateTime(dt) => Data::DateTime(*dt),
+                DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
+                DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
+                DataRef::Error(e) => Data::Error(e.clone()),
+            };
+            cells.push(((row, col), data));
+        }
+        (cells, row_min, row_max, col_min, col_max)
+    }; // cell_reader is dropped here, releasing the borrow
+
+    // Check if sheet is empty
+    if cells.is_empty() {
+        return Ok(ExcelSheet {
+            name: sheet_name.to_owned(),
+            markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
+            row_count: 0,
+            col_count: 0,
+            cell_count: 0,
+            table_cells: None,
+        });
+    }
+
+    // Calculate bounding box size
+    let bb_rows = (row_max - row_min + 1) as u64;
+    let bb_cols = (col_max - col_min + 1) as u64;
+    let bb_cells = bb_rows.saturating_mul(bb_cols);
+
+    // Check for pathological bounding box
+    if bb_cells > MAX_BOUNDING_BOX_CELLS {
+        // Sheet has sparse data at extreme positions - process directly from cells
+        return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
+    }
+
+    // Safe to create a Range - bounding box is within limits
+    // Use calamine's normal worksheet_range which will create the Range
+    let range = workbook
+        .worksheet_range(sheet_name)
+        .map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
+
+    Ok(process_sheet(sheet_name, &range))
+}
+
+/// Process a sparse sheet directly from collected cells without creating a full Range.
+///
+/// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
+/// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
+fn process_sparse_sheet_from_cells(
+    sheet_name: &str,
+    cells: Vec<((u32, u32), Data)>,
+    row_min: u32,
+    row_max: u32,
+    col_min: u32,
+    col_max: u32,
+) -> Result<ExcelSheet> {
+    let cell_count = cells.len();
+    let bb_rows = (row_max - row_min + 1) as usize;
+    let bb_cols = (col_max - col_min + 1) as usize;
+
+    // Create a warning message about the sparse data
+    let mut markdown = String::with_capacity(500 + cell_count * 50);
+    write!(
+        markdown,
+        "## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
+         Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
+        sheet_name, bb_rows, bb_cols, cell_count
+    )
+    .expect("write to String cannot fail");
+
+    // Group cells by row for tabular display
+    let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
+    for ((row, col), data) in &cells {
+        cells_by_row.entry(*row).or_default().push((*col, data));
+    }
+
+    // Sort rows and output as simple key-value pairs
+    let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
+    rows.sort_unstable();
+
+    // Limit output to first 1000 cells to avoid huge output
+    let mut output_count = 0;
+    const MAX_OUTPUT_CELLS: usize = 1000;
+
+    for row in rows {
+        if output_count >= MAX_OUTPUT_CELLS {
+            write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
+                .expect("write to String cannot fail");
+            break;
+        }
+
+        let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
+        row_cells.sort_by_key(|(col, _)| *col);
+
+        for (col, data) in row_cells {
+            if output_count >= MAX_OUTPUT_CELLS {
+                break;
+            }
+            let cell_ref = col_to_excel_letter(col);
+            let cell_str = format_cell_to_string(data);
+            if !cell_str.is_empty() {
+                writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
+                output_count += 1;
+            }
+        }
+    }
+
+    Ok(ExcelSheet {
+        name: sheet_name.to_owned(),
+        markdown,
+        row_count: bb_rows,
+        col_count: bb_cols,
+        cell_count,
+        table_cells: None, // No structured table for sparse sheets
+    })
+}
+
+/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
+fn col_to_excel_letter(col: u32) -> String {
+    let mut result = String::new();
+    let mut n = col + 1; // 1-indexed for calculation
+    while n > 0 {
+        n -= 1;
+        result.insert(0, (b'A' + (n % 26) as u8) as char);
+        n /= 26;
+    }
+    result
+}
+
 fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
 where
     RS: std::io::Read + std::io::Seek,

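`col_to_excel_letter` uses bijective base-26 (there is no zero digit, so Z rolls over to AA). Reproducing the function from the hunk above with a few spot checks:

```rust
// Spot checks for the bijective base-26 conversion added above.
fn col_to_excel_letter(col: u32) -> String {
    let mut result = String::new();
    let mut n = col + 1; // shift to 1-indexed so Z/AA roll over correctly
    while n > 0 {
        n -= 1;
        result.insert(0, (b'A' + (n % 26) as u8) as char);
        n /= 26;
    }
    result
}

fn main() {
    assert_eq!(col_to_excel_letter(0), "A");
    assert_eq!(col_to_excel_letter(25), "Z");
    assert_eq!(col_to_excel_letter(26), "AA");
    assert_eq!(col_to_excel_letter(16_383), "XFD"); // Excel's last column
}
```
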
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
     let (rows, cols) = range.get_size();
     let cell_count = range.used_cells().count();
 
-
+    // Fix for issue #331: Use actual cell count instead of declared dimensions
+    // to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
+    // Declared dimensions can claim A1:XFD1048575 (~17T cells) while actual data is minimal.
+    let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);
 
     if rows == 0 || cols == 0 {
         let markdown = format!("## {}\n\n*Empty sheet*", name);

@@ -176,6 +388,31 @@
 ///
 /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
 fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
+    // Fix for issue #331: Protect against extreme declared dimensions.
+    // Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
+    // Calling range.rows().collect() would iterate ALL declared rows causing OOM.
+    const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
+
+    let (declared_rows, _declared_cols) = range.get_size();
+
+    // If declared rows exceed reasonable limit, skip processing to avoid OOM
+    if declared_rows > MAX_REASONABLE_ROWS {
+        let actual_cell_count = range.used_cells().count();
+
+        // If actual data is minimal compared to declared size, it's a sparse/pathological file
+        if actual_cell_count < 10_000 {
+            // Return minimal output instead of OOM
+            let result_capacity = 100 + sheet_name.len();
+            let mut result = String::with_capacity(result_capacity);
+            write!(
+                result,
+                "## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
+                sheet_name, declared_rows, actual_cell_count
+            ).unwrap();
+            return (result, Vec::new());
+        }
+    }
+
     let rows: Vec<_> = range.rows().collect();
     if rows.is_empty() {
         let result_capacity = 50 + sheet_name.len();

data/vendor/kreuzberg/src/extraction/pptx/parser.rs:

@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
         }
     }
 
+    // Sort slide paths to ensure correct ordering regardless of XML order.
+    // PowerPoint doesn't guarantee relationship order in the rels file.
+    // GitHub Issue #329: Without sorting, slides can be processed in wrong order,
+    // causing images to have incorrect page numbers.
+    slide_paths.sort();
+
     Ok(slide_paths)
 }