kreuzberg 4.2.1 → 4.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +9 -9
  3. data/README.md +1 -1
  4. data/lib/kreuzberg/api_proxy.rb +3 -3
  5. data/lib/kreuzberg/cli_proxy.rb +2 -2
  6. data/lib/kreuzberg/config.rb +4 -20
  7. data/lib/kreuzberg/mcp_proxy.rb +3 -3
  8. data/lib/kreuzberg/version.rb +1 -1
  9. data/spec/binding/config_spec.rb +1 -1
  10. data/spec/unit/config/extraction_config_spec.rb +2 -2
  11. data/vendor/Cargo.toml +1 -1
  12. data/vendor/kreuzberg/Cargo.toml +3 -2
  13. data/vendor/kreuzberg/README.md +1 -1
  14. data/vendor/kreuzberg/src/api/error.rs +89 -0
  15. data/vendor/kreuzberg/src/api/handlers.rs +153 -32
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/api/openapi.rs +141 -0
  18. data/vendor/kreuzberg/src/api/router.rs +24 -2
  19. data/vendor/kreuzberg/src/api/startup.rs +11 -5
  20. data/vendor/kreuzberg/src/api/types.rs +50 -4
  21. data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
  22. data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
  23. data/vendor/kreuzberg/src/mcp/format.rs +46 -57
  24. data/vendor/kreuzberg/src/mcp/server.rs +2 -8
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
  26. data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
  27. data/vendor/kreuzberg/tests/api_embed.rs +60 -0
  28. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
  29. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  30. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
  31. data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
  32. data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
  33. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  34. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  35. metadata +3 -2

data/vendor/kreuzberg/src/api/types.rs

@@ -109,19 +109,41 @@ impl ApiSizeLimits {
     }
 }

+/// Plugin status information in health response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+pub struct PluginStatus {
+    /// Number of registered OCR backends
+    pub ocr_backends_count: usize,
+    /// Names of registered OCR backends
+    pub ocr_backends: Vec<String>,
+    /// Number of registered document extractors
+    pub extractors_count: usize,
+    /// Number of registered post-processors
+    pub post_processors_count: usize,
+}
+
 /// Health check response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HealthResponse {
     /// Health status
+    #[cfg_attr(feature = "api", schema(example = "healthy"))]
     pub status: String,
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
+    /// Plugin status (optional)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub plugins: Option<PluginStatus>,
 }

 /// Server information response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct InfoResponse {
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
     /// Whether using Rust backend
     pub rust_backend: bool,
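
The new `plugins` field is additive: `#[serde(skip_serializing_if = "Option::is_none")]` means a `/health` response without plugin information serializes exactly as in 4.2.1, so existing clients are unaffected. A minimal standalone sketch of both shapes, with the structs mirrored from the hunk above and the counts invented (requires `serde` and `serde_json`):

```rust
use serde::{Deserialize, Serialize};

// Mirrors of the structs in the hunk above, trimmed for illustration.
#[derive(Debug, Serialize, Deserialize)]
struct PluginStatus {
    ocr_backends_count: usize,
    ocr_backends: Vec<String>,
    extractors_count: usize,
    post_processors_count: usize,
}

#[derive(Debug, Serialize, Deserialize)]
struct HealthResponse {
    status: String,
    version: String,
    // Omitted from the JSON entirely when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    plugins: Option<PluginStatus>,
}

fn main() {
    // Without plugin info, the payload is identical to the 4.2.1 shape.
    let bare = HealthResponse {
        status: "healthy".into(),
        version: "0.8.0".into(),
        plugins: None,
    };
    assert_eq!(
        serde_json::to_string(&bare).unwrap(),
        r#"{"status":"healthy","version":"0.8.0"}"#
    );

    // With plugin info (counts invented), the extra object appears.
    let full = HealthResponse {
        plugins: Some(PluginStatus {
            ocr_backends_count: 1,
            ocr_backends: vec!["tesseract".into()],
            extractors_count: 12,
            post_processors_count: 0,
        }),
        ..bare
    };
    println!("{}", serde_json::to_string_pretty(&full).unwrap());
}
```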
@@ -132,15 +154,19 @@ pub type ExtractResponse = Vec<ExtractionResult>;

 /// Error response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ErrorResponse {
     /// Error type name
+    #[cfg_attr(feature = "api", schema(example = "ValidationError"))]
     pub error_type: String,
     /// Error message
+    #[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
     pub message: String,
     /// Stack trace (if available)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub traceback: Option<String>,
     /// HTTP status code
+    #[cfg_attr(feature = "api", schema(example = 400))]
     pub status_code: u16,
 }

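The `ToSchema` derives and `schema(example = ...)` annotations throughout this file feed the OpenAPI document added in `src/api/openapi.rs` (file 17 in the list above). A trimmed sketch of how such derives compose into a document, with the `api` feature gate dropped and the struct cut down; this assumes a plain `utoipa` dependency and is illustrative only:

```rust
use serde::{Deserialize, Serialize};
use utoipa::{OpenApi, ToSchema};

// Cut-down ErrorResponse; the real one gates ToSchema behind
// cfg_attr(feature = "api") as shown in the hunk above.
#[derive(Serialize, Deserialize, ToSchema)]
struct ErrorResponse {
    /// Error type name
    #[schema(example = "ValidationError")]
    error_type: String,
    /// Error message
    #[schema(example = "Invalid input provided")]
    message: String,
    /// HTTP status code
    #[schema(example = 400)]
    status_code: u16,
}

// Registering the schema produces an OpenAPI document that carries
// the example values annotated above.
#[derive(OpenApi)]
#[openapi(components(schemas(ErrorResponse)))]
struct ApiDoc;

fn main() {
    println!("{}", ApiDoc::openapi().to_pretty_json().unwrap());
}
```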
@@ -156,8 +182,10 @@ pub struct ApiState {

 /// Cache statistics response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheStatsResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Total number of cache files
     pub total_files: usize,
@@ -173,8 +201,10 @@ pub struct CacheStatsResponse {

 /// Cache clear response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheClearResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Number of files removed
     pub removed_files: usize,
@@ -184,20 +214,25 @@ pub struct CacheClearResponse {

 /// Embedding request for generating embeddings from text.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedRequest {
-    /// Text strings to generate embeddings for
+    /// Text strings to generate embeddings for (at least one non-empty string required)
+    #[cfg_attr(feature = "api", schema(min_items = 1))]
     pub texts: Vec<String>,
     /// Optional embedding configuration (model, batch size, etc.)
     #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
     pub config: Option<crate::core::config::EmbeddingConfig>,
 }

 /// Embedding response containing generated embeddings.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedResponse {
     /// Generated embeddings (one per input text)
     pub embeddings: Vec<Vec<f32>>,
     /// Model used for embedding generation
+    #[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
     pub model: String,
     /// Dimensionality of the embeddings
     pub dimensions: usize,
@@ -212,23 +247,29 @@ fn default_chunker_type() -> String {

 /// Chunk request with text and configuration.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkRequest {
-    /// Text to chunk
+    /// Text to chunk (must not be empty)
+    #[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
     pub text: String,
     /// Optional chunking configuration
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<ChunkingConfigRequest>,
     /// Chunker type (text or markdown)
     #[serde(default = "default_chunker_type")]
+    #[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
     pub chunker_type: String,
 }

 /// Chunking configuration request.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigRequest {
-    /// Maximum characters per chunk
+    /// Maximum characters per chunk (must be greater than overlap, default: 2000)
+    #[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
     pub max_characters: Option<usize>,
-    /// Overlap between chunks in characters
+    /// Overlap between chunks in characters (must be less than max_characters, default: 100)
+    #[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
     pub overlap: Option<usize>,
     /// Whether to trim whitespace
     pub trim: Option<bool>,
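
`chunker_type` keeps its `#[serde(default = "default_chunker_type")]`, so the field may be omitted from requests; the new `pattern` annotation only documents the accepted values in the schema. A small sketch of the deserialization behavior, with `ChunkRequest` trimmed to the two relevant fields:

```rust
use serde::Deserialize;

fn default_chunker_type() -> String {
    "text".to_string()
}

// ChunkRequest trimmed for illustration.
#[derive(Debug, Deserialize)]
struct ChunkRequest {
    text: String,
    #[serde(default = "default_chunker_type")]
    chunker_type: String,
}

fn main() {
    // chunker_type omitted: serde falls back to default_chunker_type().
    let req: ChunkRequest = serde_json::from_str(r#"{"text": "hello"}"#).unwrap();
    assert_eq!(req.chunker_type, "text");

    // An explicit value passes through; rejecting anything but text|markdown
    // is left to the handler, not the deserializer.
    let req: ChunkRequest =
        serde_json::from_str(r#"{"text": "hello", "chunker_type": "markdown"}"#).unwrap();
    assert_eq!(req.chunker_type, "markdown");
}
```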
@@ -236,6 +277,7 @@ pub struct ChunkingConfigRequest {

 /// Chunk response with chunks and metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkResponse {
     /// List of chunks
     pub chunks: Vec<ChunkItem>,
@@ -246,11 +288,13 @@ pub struct ChunkResponse {
     /// Input text size in bytes
     pub input_size_bytes: usize,
     /// Chunker type used for chunking
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }

 /// Individual chunk item with metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkItem {
     /// Chunk content
     pub content: String,
@@ -272,6 +316,7 @@ pub struct ChunkItem {

 /// Chunking configuration response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigResponse {
     /// Maximum characters per chunk
     pub max_characters: usize,
@@ -280,5 +325,6 @@ pub struct ChunkingConfigResponse {
     /// Whether whitespace was trimmed
     pub trim: bool,
     /// Type of chunker used
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }

data/vendor/kreuzberg/src/core/config/processing.rs

@@ -84,7 +84,8 @@ pub struct ChunkingConfig {
 /// Requires the `embeddings` feature to be enabled.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct EmbeddingConfig {
-    /// The embedding model to use
+    /// The embedding model to use (defaults to "balanced" preset if not specified)
+    #[serde(default = "default_model")]
     pub model: EmbeddingModelType,

     /// Whether to normalize embedding vectors (recommended for cosine similarity)
@@ -156,6 +157,12 @@ fn default_batch_size() -> usize {
     32
 }

+fn default_model() -> EmbeddingModelType {
+    EmbeddingModelType::Preset {
+        name: "balanced".to_string(),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
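
With the new `#[serde(default = "default_model")]`, an embedding config that omits `model` now deserializes and picks the "balanced" preset instead of failing with a missing-field error. A sketch with stand-in types (the real `EmbeddingModelType` lives in `core::config`, has more variants, and its own serde representation; only the default path is exercised here):

```rust
use serde::Deserialize;

// Stand-in for the real enum; illustrative only.
#[derive(Debug, Deserialize, PartialEq)]
enum EmbeddingModelType {
    Preset { name: String },
}

fn default_model() -> EmbeddingModelType {
    EmbeddingModelType::Preset {
        name: "balanced".to_string(),
    }
}

// Trimmed stand-in for EmbeddingConfig.
#[derive(Debug, Deserialize)]
struct EmbeddingConfig {
    #[serde(default = "default_model")]
    model: EmbeddingModelType,
    #[serde(default)]
    normalize: bool,
}

fn main() {
    // No "model" key: previously a hard deserialization error,
    // now the "balanced" preset.
    let cfg: EmbeddingConfig = serde_json::from_str(r#"{"normalize": true}"#).unwrap();
    assert_eq!(
        cfg.model,
        EmbeddingModelType::Preset { name: "balanced".to_string() }
    );
    assert!(cfg.normalize);
}
```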

data/vendor/kreuzberg/src/extraction/excel.rs

@@ -27,16 +27,23 @@
 //! # Ok(())
 //! # }
 //! ```
-use calamine::{Data, Range, Reader, open_workbook_auto};
+use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
 use std::collections::HashMap;
 use std::fmt::Write as FmtWrite;
-use std::io::Cursor;
+use std::io::{Cursor, Read, Seek};
 use std::path::Path;

 use crate::error::{KreuzbergError, Result};
 use crate::extraction::capacity;
 use crate::types::{ExcelSheet, ExcelWorkbook};

+/// Maximum number of cells in a Range's bounding box before we consider it pathological.
+/// This threshold is set to prevent OOM when processing files with sparse data at extreme
+/// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
+///
+/// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
+const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
+
 #[cfg(feature = "office")]
 use crate::extraction::office_metadata::{
     extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,
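
The arithmetic behind this threshold, as a standalone sanity check (not part of the crate): a single stray cell in the last column and last row of the XLSX grid produces a 2^14 x 2^20 bounding box, well past the dense-extraction limit:

```rust
// Standalone sanity check of the threshold; not part of the crate.
fn bounding_box_cells(rows: u64, cols: u64) -> u64 {
    rows.saturating_mul(cols)
}

fn main() {
    const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;

    // XLSX grid limits: 16_384 columns (XFD) x 1_048_576 rows.
    // One cell at A1 plus one near XFD1048575 spans the whole grid.
    let solver_case = bounding_box_cells(1_048_576, 16_384);
    assert_eq!(solver_case, 17_179_869_184); // 2^34, ~17.2 billion cells

    // At ~64 bytes per cell a dense Range would need ~1.1 TB, while the
    // threshold caps the dense path at ~6.4 GB.
    assert!(solver_case > MAX_BOUNDING_BOX_CELLS);

    // A large but ordinary sheet stays on the dense fast path.
    assert!(bounding_box_cells(50_000, 200) <= MAX_BOUNDING_BOX_CELLS);
}
```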
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
 use serde_json::Value;

 pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
+    let lower_path = file_path.to_lowercase();
+
     #[cfg(feature = "office")]
-    let office_metadata = if file_path.to_lowercase().ends_with(".xlsx")
-        || file_path.to_lowercase().ends_with(".xlsm")
-        || file_path.to_lowercase().ends_with(".xlam")
-        || file_path.to_lowercase().ends_with(".xltm")
+    let office_metadata = if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
     {
         extract_xlsx_office_metadata_from_file(file_path).ok()
     } else {
@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
     #[cfg(not(feature = "office"))]
     let office_metadata: Option<HashMap<String, String>> = None;

-    // We analyze the error and only wrap format errors, letting real IO errors bubble up ~keep
+    // For XLSX files, use specialized handler with OOM protection
+    if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
+    {
+        let file = std::fs::File::open(file_path)?;
+        let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
+        return process_xlsx_workbook(workbook, office_metadata);
+    }
+
+    // For other formats, use open_workbook_auto
     let workbook = match open_workbook_auto(Path::new(file_path)) {
         Ok(wb) => wb,
         Err(calamine::Error::Io(io_err)) => {
@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
         ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
             let workbook = calamine::Xlsx::new(cursor)
                 .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
-            process_workbook(workbook, office_metadata)
+            process_xlsx_workbook(workbook, office_metadata)
         }
         ".xls" | ".xla" => {
             let workbook = calamine::Xls::new(cursor)
@@ -118,6 +139,194 @@
     }
 }

+/// Process XLSX workbooks with special handling for pathological sparse files.
+///
+/// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
+/// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
+/// OOM when processing files like Excel Solver files that have cells at both A1 and
+/// XFD1048575, creating a bounding box of ~17 billion cells.
+fn process_xlsx_workbook<RS: Read + Seek>(
+    mut workbook: calamine::Xlsx<RS>,
+    office_metadata: Option<HashMap<String, String>>,
+) -> Result<ExcelWorkbook> {
+    let sheet_names = workbook.sheet_names();
+    let mut sheets = Vec::with_capacity(sheet_names.len());
+
+    for name in &sheet_names {
+        // Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
+        match process_xlsx_sheet_safe(&mut workbook, name) {
+            Ok(sheet) => sheets.push(sheet),
+            Err(e) => {
+                // Log but don't fail - continue with other sheets
+                tracing::warn!("Failed to process sheet '{}': {}", name, e);
+            }
+        }
+    }
+
+    let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
+    Ok(ExcelWorkbook { sheets, metadata })
+}
+
+/// Process a single XLSX sheet safely by pre-checking the bounding box.
+///
+/// This function streams cells to compute the actual bounding box without allocating
+/// a full Range, then only creates the Range if the bounding box is within safe limits.
+fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
+    // First pass: stream cells to compute actual bounding box and collect cell data
+    let (cells, row_min, row_max, col_min, col_max) = {
+        let mut cell_reader = workbook
+            .worksheet_cells_reader(sheet_name)
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
+
+        let mut cells: Vec<((u32, u32), Data)> = Vec::new();
+        let mut row_min = u32::MAX;
+        let mut row_max = 0u32;
+        let mut col_min = u32::MAX;
+        let mut col_max = 0u32;
+
+        // Stream through all cells, tracking bounds
+        while let Ok(Some(cell)) = cell_reader.next_cell() {
+            let (row, col) = cell.get_position();
+            row_min = row_min.min(row);
+            row_max = row_max.max(row);
+            col_min = col_min.min(col);
+            col_max = col_max.max(col);
+
+            // Convert DataRef to owned Data
+            let data: Data = match cell.get_value() {
+                DataRef::Empty => Data::Empty,
+                DataRef::String(s) => Data::String(s.clone()),
+                DataRef::SharedString(s) => Data::String(s.to_string()),
+                DataRef::Float(f) => Data::Float(*f),
+                DataRef::Int(i) => Data::Int(*i),
+                DataRef::Bool(b) => Data::Bool(*b),
+                DataRef::DateTime(dt) => Data::DateTime(*dt),
+                DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
+                DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
+                DataRef::Error(e) => Data::Error(e.clone()),
+            };
+            cells.push(((row, col), data));
+        }
+        (cells, row_min, row_max, col_min, col_max)
+    }; // cell_reader is dropped here, releasing the borrow
+
+    // Check if sheet is empty
+    if cells.is_empty() {
+        return Ok(ExcelSheet {
+            name: sheet_name.to_owned(),
+            markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
+            row_count: 0,
+            col_count: 0,
+            cell_count: 0,
+            table_cells: None,
+        });
+    }
+
+    // Calculate bounding box size
+    let bb_rows = (row_max - row_min + 1) as u64;
+    let bb_cols = (col_max - col_min + 1) as u64;
+    let bb_cells = bb_rows.saturating_mul(bb_cols);
+
+    // Check for pathological bounding box
+    if bb_cells > MAX_BOUNDING_BOX_CELLS {
+        // Sheet has sparse data at extreme positions - process directly from cells
+        return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
+    }
+
+    // Safe to create a Range - bounding box is within limits
+    // Use calamine's normal worksheet_range which will create the Range
+    let range = workbook
+        .worksheet_range(sheet_name)
+        .map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
+
+    Ok(process_sheet(sheet_name, &range))
+}
+
+/// Process a sparse sheet directly from collected cells without creating a full Range.
+///
+/// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
+/// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
+fn process_sparse_sheet_from_cells(
+    sheet_name: &str,
+    cells: Vec<((u32, u32), Data)>,
+    row_min: u32,
+    row_max: u32,
+    col_min: u32,
+    col_max: u32,
+) -> Result<ExcelSheet> {
+    let cell_count = cells.len();
+    let bb_rows = (row_max - row_min + 1) as usize;
+    let bb_cols = (col_max - col_min + 1) as usize;
+
+    // Create a warning message about the sparse data
+    let mut markdown = String::with_capacity(500 + cell_count * 50);
+    write!(
+        markdown,
+        "## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
+         Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
+        sheet_name, bb_rows, bb_cols, cell_count
+    )
+    .expect("write to String cannot fail");
+
+    // Group cells by row for tabular display
+    let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
+    for ((row, col), data) in &cells {
+        cells_by_row.entry(*row).or_default().push((*col, data));
+    }
+
+    // Sort rows and output as simple key-value pairs
+    let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
+    rows.sort_unstable();
+
+    // Limit output to first 1000 cells to avoid huge output
+    let mut output_count = 0;
+    const MAX_OUTPUT_CELLS: usize = 1000;
+
+    for row in rows {
+        if output_count >= MAX_OUTPUT_CELLS {
+            write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
+                .expect("write to String cannot fail");
+            break;
+        }
+
+        let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
+        row_cells.sort_by_key(|(col, _)| *col);
+
+        for (col, data) in row_cells {
+            if output_count >= MAX_OUTPUT_CELLS {
+                break;
+            }
+            let cell_ref = col_to_excel_letter(col);
+            let cell_str = format_cell_to_string(data);
+            if !cell_str.is_empty() {
+                writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
+                output_count += 1;
+            }
+        }
+    }
+
+    Ok(ExcelSheet {
+        name: sheet_name.to_owned(),
+        markdown,
+        row_count: bb_rows,
+        col_count: bb_cols,
+        cell_count,
+        table_cells: None, // No structured table for sparse sheets
+    })
+}
+
+/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
+fn col_to_excel_letter(col: u32) -> String {
+    let mut result = String::new();
+    let mut n = col + 1; // 1-indexed for calculation
+    while n > 0 {
+        n -= 1;
+        result.insert(0, (b'A' + (n % 26) as u8) as char);
+        n /= 26;
+    }
+    result
+}
+
 fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
 where
     RS: std::io::Read + std::io::Seek,
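
`col_to_excel_letter` is bijective base-26: Excel column letters have no zero digit, hence the `n -= 1` before each modulo. A standalone copy with spot checks:

```rust
// Standalone copy of col_to_excel_letter for spot-checking the conversion.
fn col_to_excel_letter(col: u32) -> String {
    let mut result = String::new();
    let mut n = col + 1; // 1-indexed for calculation
    while n > 0 {
        n -= 1;
        result.insert(0, (b'A' + (n % 26) as u8) as char);
        n /= 26;
    }
    result
}

fn main() {
    assert_eq!(col_to_excel_letter(0), "A");
    assert_eq!(col_to_excel_letter(25), "Z");
    assert_eq!(col_to_excel_letter(26), "AA"); // no zero digit: Z rolls to AA
    assert_eq!(col_to_excel_letter(701), "ZZ");
    assert_eq!(col_to_excel_letter(702), "AAA");
    assert_eq!(col_to_excel_letter(16_383), "XFD"); // last XLSX column
}
```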
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
     let (rows, cols) = range.get_size();
     let cell_count = range.used_cells().count();

-    let estimated_capacity = 50 + (cols * 20) + (rows * cols * 12);
+    // Fix for issue #331: Use actual cell count instead of declared dimensions
+    // to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
+    // Declared dimensions can claim A1:XFD1048575 (~17B cells) while actual data is minimal.
+    let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);

     if rows == 0 || cols == 0 {
         let markdown = format!("## {}\n\n*Empty sheet*", name);
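
What the capacity change buys, in numbers: using the Solver-style figures from the comments above (~26 real cells inside a full-grid bounding box), the old formula pre-allocated by declared area while the new one scales with actual content. A standalone check (64-bit target assumed so the old product fits in `usize`):

```rust
// Standalone before/after comparison of the capacity estimate (issue #331).
fn main() {
    let (rows, cols, cell_count) = (1_048_576usize, 16_384usize, 26usize);

    // Old: scaled with the declared grid -> ~206 GB pre-allocation.
    let old = 50 + (cols * 20) + (rows * cols * 12);
    assert_eq!(old, 206_158_757_938);

    // New: scales with the cells that actually exist -> ~320 KiB.
    let new = 50 + (cols * 20) + (cell_count * 12);
    assert_eq!(new, 328_042);
}
```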
@@ -176,6 +388,31 @@
 ///
 /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
 fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
+    // Fix for issue #331: Protect against extreme declared dimensions.
+    // Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
+    // Calling range.rows().collect() would iterate ALL declared rows causing OOM.
+    const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
+
+    let (declared_rows, _declared_cols) = range.get_size();
+
+    // If declared rows exceed reasonable limit, skip processing to avoid OOM
+    if declared_rows > MAX_REASONABLE_ROWS {
+        let actual_cell_count = range.used_cells().count();
+
+        // If actual data is minimal compared to declared size, it's a sparse/pathological file
+        if actual_cell_count < 10_000 {
+            // Return minimal output instead of OOM
+            let result_capacity = 100 + sheet_name.len();
+            let mut result = String::with_capacity(result_capacity);
+            write!(
+                result,
+                "## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
+                sheet_name, declared_rows, actual_cell_count
+            ).unwrap();
+            return (result, Vec::new());
+        }
+    }
+
     let rows: Vec<_> = range.rows().collect();
     if rows.is_empty() {
         let result_capacity = 50 + sheet_name.len();

data/vendor/kreuzberg/src/mcp/format.rs

@@ -72,28 +72,12 @@ pub(super) fn build_config(
     }
 }

-/// Format extraction result as human-readable text.
+/// Format extraction result as JSON string.
+///
+/// Serializes the full `ExtractionResult` to JSON, ensuring 1:1 parity
+/// with the API and CLI JSON output.
 pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
-    let mut response = String::new();
-
-    response.push_str(&format!("Content ({} characters):\n", result.content.len()));
-    response.push_str(&result.content);
-    response.push_str("\n\n");
-
-    response.push_str("Metadata:\n");
-    response.push_str(&serde_json::to_string_pretty(&result.metadata).unwrap_or_default());
-    response.push_str("\n\n");
-
-    if !result.tables.is_empty() {
-        response.push_str(&format!("Tables ({}):\n", result.tables.len()));
-        for (i, table) in result.tables.iter().enumerate() {
-            response.push_str(&format!("\nTable {} (page {}):\n", i + 1, table.page_number));
-            response.push_str(&table.markdown);
-            response.push('\n');
-        }
-    }
-
-    response
+    serde_json::to_string_pretty(result).unwrap_or_default()
 }

 #[cfg(test)]
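
MCP tool output is now the same pretty-printed JSON the API and CLI produce, rather than a bespoke text layout. A standalone sketch of the new path with a trimmed stand-in for `ExtractionResult` (the real struct has many more optional fields, likewise skipped when `None`, as the reworked tests below verify):

```rust
use serde::Serialize;

// Trimmed stand-in for ExtractionResult, enough to show the output shape.
#[derive(Serialize)]
struct ExtractionResult {
    content: String,
    mime_type: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    detected_languages: Option<Vec<String>>,
}

// Same one-liner as the new implementation: pretty JSON, or "" if
// serialization ever fails.
fn format_extraction_result(result: &ExtractionResult) -> String {
    serde_json::to_string_pretty(result).unwrap_or_default()
}

fn main() {
    let result = ExtractionResult {
        content: "Sample extracted text".into(),
        mime_type: "text/plain".into(),
        detected_languages: None,
    };
    // Prints:
    // {
    //   "content": "Sample extracted text",
    //   "mime_type": "text/plain"
    // }
    println!("{}", format_extraction_result(&result));
}
```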
@@ -303,7 +287,7 @@ mod tests {
     }

     #[test]
-    fn test_format_extraction_result_with_content() {
+    fn test_format_extraction_result_is_valid_json() {
         let result = KreuzbergResult {
             content: "Sample extracted text".to_string(),
             mime_type: "text/plain".to_string(),
@@ -318,36 +302,27 @@
         };

         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");

-        assert!(formatted.contains("Content (21 characters)"));
-        assert!(formatted.contains("Sample extracted text"));
-        assert!(formatted.contains("Metadata:"));
+        assert_eq!(parsed["content"], "Sample extracted text");
+        assert_eq!(parsed["mime_type"], "text/plain");
+        assert!(parsed["metadata"].is_object());
     }

     #[test]
-    fn test_format_extraction_result_with_tables() {
+    fn test_format_extraction_result_includes_tables() {
         let result = KreuzbergResult {
             content: "Document with tables".to_string(),
             mime_type: "application/pdf".to_string(),
             metadata: crate::Metadata::default(),
-            tables: vec![
-                crate::Table {
-                    cells: vec![
-                        vec!["Col1".to_string(), "Col2".to_string()],
-                        vec!["A".to_string(), "B".to_string()],
-                    ],
-                    page_number: 1,
-                    markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
-                },
-                crate::Table {
-                    cells: vec![
-                        vec!["X".to_string(), "Y".to_string()],
-                        vec!["1".to_string(), "2".to_string()],
-                    ],
-                    page_number: 2,
-                    markdown: "| X | Y |\n|---|---|\n| 1 | 2 |".to_string(),
-                },
-            ],
+            tables: vec![crate::Table {
+                cells: vec![
+                    vec!["Col1".to_string(), "Col2".to_string()],
+                    vec!["A".to_string(), "B".to_string()],
+                ],
+                page_number: 1,
+                markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
+            }],
             detected_languages: None,
             chunks: None,
             images: None,
 
368
341
  #[test]
369
- fn test_format_extraction_result_empty_content() {
342
+ fn test_format_extraction_result_includes_chunks_when_present() {
370
343
  let result = KreuzbergResult {
371
- content: String::new(),
344
+ content: "Chunked text".to_string(),
372
345
  mime_type: "text/plain".to_string(),
373
346
  metadata: crate::Metadata::default(),
374
347
  tables: vec![],
375
348
  detected_languages: None,
376
- chunks: None,
349
+ chunks: Some(vec![crate::Chunk {
350
+ content: "Chunk 1".to_string(),
351
+ embedding: None,
352
+ metadata: crate::ChunkMetadata {
353
+ byte_start: 0,
354
+ byte_end: 7,
355
+ token_count: None,
356
+ chunk_index: 0,
357
+ total_chunks: 1,
358
+ first_page: None,
359
+ last_page: None,
360
+ },
361
+ }]),
377
362
  images: None,
378
363
  pages: None,
379
364
  elements: None,
@@ -381,13 +366,14 @@
         };

         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");

-        assert!(formatted.contains("Content (0 characters)"));
-        assert!(formatted.contains("Metadata:"));
+        assert_eq!(parsed["chunks"].as_array().unwrap().len(), 1);
+        assert_eq!(parsed["chunks"][0]["content"], "Chunk 1");
     }

     #[test]
-    fn test_format_extraction_result_no_tables() {
+    fn test_format_extraction_result_omits_none_fields() {
         let result = KreuzbergResult {
             content: "Simple text".to_string(),
             mime_type: "text/plain".to_string(),
@@ -402,8 +388,11 @@
         };

         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");

-        assert!(formatted.contains("Simple text"));
-        assert!(!formatted.contains("Tables"));
+        // None fields should be omitted via skip_serializing_if
+        assert!(parsed.get("chunks").is_none());
+        assert!(parsed.get("images").is_none());
+        assert!(parsed.get("detected_languages").is_none());
     }
 }
  }