kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
data/vendor/kreuzberg/src/api/types.rs:

@@ -109,19 +109,41 @@ impl ApiSizeLimits {
     }
 }
 
+/// Plugin status information in health response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+pub struct PluginStatus {
+    /// Number of registered OCR backends
+    pub ocr_backends_count: usize,
+    /// Names of registered OCR backends
+    pub ocr_backends: Vec<String>,
+    /// Number of registered document extractors
+    pub extractors_count: usize,
+    /// Number of registered post-processors
+    pub post_processors_count: usize,
+}
+
 /// Health check response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HealthResponse {
     /// Health status
+    #[cfg_attr(feature = "api", schema(example = "healthy"))]
     pub status: String,
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
+    /// Plugin status (optional)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub plugins: Option<PluginStatus>,
 }
 
 /// Server information response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct InfoResponse {
     /// API version
+    #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
     pub version: String,
     /// Whether using Rust backend
     pub rust_backend: bool,

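The new optional `plugins` field is guarded by `#[serde(skip_serializing_if = "Option::is_none")]`, so health responses without plugin data keep their previous shape. A minimal sketch of that behavior, using a simplified stand-in for `PluginStatus` (this example is illustrative, not part of the diff):

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct HealthResponse {
    status: String,
    version: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    plugins: Option<Vec<String>>, // simplified stand-in for PluginStatus
}

fn main() {
    let r = HealthResponse {
        status: "healthy".into(),
        version: "0.8.0".into(),
        plugins: None,
    };
    // Prints {"status":"healthy","version":"0.8.0"} with no "plugins" key,
    // matching what pre-4.2.x clients already expect.
    println!("{}", serde_json::to_string(&r).unwrap());
}
```
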
@@ -132,15 +154,19 @@ pub type ExtractResponse = Vec<ExtractionResult>;
 
 /// Error response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ErrorResponse {
     /// Error type name
+    #[cfg_attr(feature = "api", schema(example = "ValidationError"))]
     pub error_type: String,
     /// Error message
+    #[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
     pub message: String,
     /// Stack trace (if available)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub traceback: Option<String>,
     /// HTTP status code
+    #[cfg_attr(feature = "api", schema(example = 400))]
     pub status_code: u16,
 }
 

@@ -156,8 +182,10 @@ pub struct ApiState {
 
 /// Cache statistics response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheStatsResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Total number of cache files
     pub total_files: usize,

@@ -173,8 +201,10 @@ pub struct CacheStatsResponse {
 
 /// Cache clear response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct CacheClearResponse {
     /// Cache directory path
+    #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
     pub directory: String,
     /// Number of files removed
     pub removed_files: usize,

@@ -184,20 +214,25 @@ pub struct CacheClearResponse {
 
 /// Embedding request for generating embeddings from text.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedRequest {
-    /// Text strings to generate embeddings for
+    /// Text strings to generate embeddings for (at least one non-empty string required)
+    #[cfg_attr(feature = "api", schema(min_items = 1))]
     pub texts: Vec<String>,
     /// Optional embedding configuration (model, batch size, etc.)
     #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
     pub config: Option<crate::core::config::EmbeddingConfig>,
 }
 
 /// Embedding response containing generated embeddings.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmbedResponse {
     /// Generated embeddings (one per input text)
     pub embeddings: Vec<Vec<f32>>,
     /// Model used for embedding generation
+    #[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
     pub model: String,
     /// Dimensionality of the embeddings
     pub dimensions: usize,

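Taken together, the annotations describe an embed request like the sketch below. The payload is hypothetical: the `batch_size` key is assumed from the `default_batch_size` helper referenced later in this diff, and `EmbeddingConfig` accepts other fields not shown here.

```rust
// Hypothetical request body consistent with EmbedRequest above:
// "texts" must contain at least one entry (schema(min_items = 1)),
// and "config" may be omitted entirely (skip_serializing_if).
fn main() {
    let body = serde_json::json!({
        "texts": ["Hello, world"],
        "config": { "batch_size": 32 } // key assumed from default_batch_size()
    });
    println!("{body}");
}
```
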
@@ -212,23 +247,29 @@ fn default_chunker_type() -> String {
 
 /// Chunk request with text and configuration.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkRequest {
-    /// Text to chunk
+    /// Text to chunk (must not be empty)
+    #[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
     pub text: String,
     /// Optional chunking configuration
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<ChunkingConfigRequest>,
     /// Chunker type (text or markdown)
     #[serde(default = "default_chunker_type")]
+    #[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
     pub chunker_type: String,
 }
 
 /// Chunking configuration request.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigRequest {
-    /// Maximum characters per chunk
+    /// Maximum characters per chunk (must be greater than overlap, default: 2000)
+    #[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
     pub max_characters: Option<usize>,
-    /// Overlap between chunks in characters
+    /// Overlap between chunks in characters (must be less than max_characters, default: 100)
+    #[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
     pub overlap: Option<usize>,
     /// Whether to trim whitespace
     pub trim: Option<bool>,

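The new doc comments state an invariant that the per-field schema bounds can only approximate: `overlap` must stay strictly below `max_characters`. A sketch of the pairwise check, assuming the defaults named in the docs (the crate's actual validation code is not shown in this diff):

```rust
// Illustrative pairwise check for the documented invariant, using the
// defaults from the doc comments above (2000 and 100).
fn validate_chunking(max_characters: Option<usize>, overlap: Option<usize>) -> Result<(), String> {
    let max = max_characters.unwrap_or(2000);
    let ov = overlap.unwrap_or(100);
    if ov >= max {
        return Err(format!("overlap ({ov}) must be less than max_characters ({max})"));
    }
    Ok(())
}

fn main() {
    assert!(validate_chunking(None, None).is_ok());
    assert!(validate_chunking(Some(100), Some(100)).is_err());
}
```
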
@@ -236,6 +277,7 @@ pub struct ChunkingConfigRequest {
 
 /// Chunk response with chunks and metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkResponse {
     /// List of chunks
     pub chunks: Vec<ChunkItem>,

@@ -246,11 +288,13 @@ pub struct ChunkResponse {
     /// Input text size in bytes
     pub input_size_bytes: usize,
     /// Chunker type used for chunking
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }
 
 /// Individual chunk item with metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkItem {
     /// Chunk content
     pub content: String,

@@ -272,6 +316,7 @@ pub struct ChunkItem {
 
 /// Chunking configuration response.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkingConfigResponse {
     /// Maximum characters per chunk
     pub max_characters: usize,

@@ -280,5 +325,6 @@ pub struct ChunkingConfigResponse {
     /// Whether whitespace was trimmed
     pub trim: bool,
     /// Type of chunker used
+    #[cfg_attr(feature = "api", schema(example = "text"))]
     pub chunker_type: String,
 }

data/vendor/kreuzberg/src/core/config/processing.rs:

@@ -84,7 +84,8 @@ pub struct ChunkingConfig {
 /// Requires the `embeddings` feature to be enabled.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct EmbeddingConfig {
-    /// The embedding model to use
+    /// The embedding model to use (defaults to "balanced" preset if not specified)
+    #[serde(default = "default_model")]
     pub model: EmbeddingModelType,
 
     /// Whether to normalize embedding vectors (recommended for cosine similarity)

@@ -156,6 +157,12 @@ fn default_batch_size() -> usize {
     32
 }
 
+fn default_model() -> EmbeddingModelType {
+    EmbeddingModelType::Preset {
+        name: "balanced".to_string(),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

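With `#[serde(default = "default_model")]`, configs that omit `model` now deserialize to the "balanced" preset instead of failing. A self-contained sketch of the mechanism; `ModelType` here is a stand-in, since `EmbeddingModelType`'s exact serde representation is not shown in this diff:

```rust
use serde::Deserialize;

// Stand-in for kreuzberg's EmbeddingModelType (tag layout assumed).
#[derive(Debug, Deserialize, PartialEq)]
#[serde(tag = "type")]
enum ModelType {
    Preset { name: String },
}

fn default_model() -> ModelType {
    ModelType::Preset { name: "balanced".to_string() }
}

#[derive(Debug, Deserialize)]
struct EmbeddingConfig {
    // Called only when the field is absent from the input.
    #[serde(default = "default_model")]
    model: ModelType,
}

fn main() {
    // `model` may now be omitted from the payload entirely.
    let cfg: EmbeddingConfig = serde_json::from_str("{}").unwrap();
    assert_eq!(cfg.model, ModelType::Preset { name: "balanced".into() });
}
```
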
data/vendor/kreuzberg/src/core/config_validation/sections.rs:

@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 /// Valid tesseract OEM (OCR Engine Mode) values.
 const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
 
-/// Valid output formats for
-
+/// Valid output formats for document extraction.
+/// Supports plain text, markdown, djot, and HTML output formats.
+/// Also accepts aliases: "text" for "plain", "md" for "markdown".
+const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
 
 /// Validate a binarization method string.
 ///

@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
     }
 }
 
-/// Validate a
+/// Validate a document extraction output format.
+///
+/// Accepts the following formats and aliases:
+/// - "plain" or "text" for plain text output
+/// - "markdown" or "md" for Markdown output
+/// - "djot" for Djot markup format
+/// - "html" for HTML output
 ///
 /// # Arguments
 ///
-/// * `format` - The output format to validate
+/// * `format` - The output format to validate
 ///
 /// # Returns
 ///

@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
 /// use kreuzberg::core::config_validation::validate_output_format;
 ///
 /// assert!(validate_output_format("text").is_ok());
+/// assert!(validate_output_format("plain").is_ok());
 /// assert!(validate_output_format("markdown").is_ok());
+/// assert!(validate_output_format("md").is_ok());
+/// assert!(validate_output_format("djot").is_ok());
+/// assert!(validate_output_format("html").is_ok());
 /// assert!(validate_output_format("json").is_err());
 /// ```
 pub fn validate_output_format(format: &str) -> Result<()> {

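The body of `validate_output_format` falls outside this hunk, but given `VALID_OUTPUT_FORMATS` above, a simple membership check would satisfy the doctest. Sketch only; the real function returns the crate's `Result` with a `KreuzbergError`, not a `String`:

```rust
// Plausible validator over the allowlist shown earlier in this diff.
const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];

fn validate_output_format(format: &str) -> Result<(), String> {
    if VALID_OUTPUT_FORMATS.contains(&format) {
        Ok(())
    } else {
        Err(format!("invalid output format {format:?}, expected one of {VALID_OUTPUT_FORMATS:?}"))
    }
}

fn main() {
    assert!(validate_output_format("djot").is_ok());
    assert!(validate_output_format("json").is_err());
}
```
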
data/vendor/kreuzberg/src/core/extractor/file.rs:

@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
-/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
 ///
 /// # Example
 ///

data/vendor/kreuzberg/src/core/extractor/mod.rs:

@@ -411,7 +411,8 @@ mod tests {
 
         assert!(result.is_err());
         use crate::KreuzbergError;
-
+        // File validation returns Io error, not Validation error
+        assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
     }
 
     #[test]

data/vendor/kreuzberg/src/core/io.rs:

@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if file doesn't exist.
 pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
     if !file_exists(&path) {
-        return Err(KreuzbergError::
-
-            path.as_ref().display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.as_ref().display()),
         )));
     }
     Ok(())

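The `io.rs` call sites now build a `std::io::Error` with an explicit `ErrorKind` and convert it via `From`, which implies a `From<std::io::Error>` impl on `KreuzbergError`. A compact stand-in showing the pattern end to end (the real error type has more variants than this):

```rust
use std::path::Path;

// Stand-in for the real error type; the From impl is implied by the
// KreuzbergError::from(std::io::Error::new(..)) calls in this diff.
#[derive(Debug)]
enum KreuzbergError {
    Io { source: std::io::Error },
}

impl From<std::io::Error> for KreuzbergError {
    fn from(source: std::io::Error) -> Self {
        KreuzbergError::Io { source }
    }
}

fn validate_file_exists(path: &Path) -> Result<(), KreuzbergError> {
    if !path.exists() {
        return Err(KreuzbergError::from(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("File does not exist: {}", path.display()),
        )));
    }
    Ok(())
}

fn main() {
    let err = validate_file_exists(Path::new("/no/such/file")).unwrap_err();
    // Callers can now match on the Io variant, as the updated test does.
    assert!(matches!(err, KreuzbergError::Io { .. }));
}
```
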
@@ -99,9 +99,9 @@ where
     let mut files = Vec::new();
 
     if !dir.is_dir() {
-        return Err(KreuzbergError::
-
-            dir.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotADirectory,
+            format!("Path is not a directory: {}", dir.display()),
         )));
     }
 

data/vendor/kreuzberg/src/core/mime.rs:

@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::
+/// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
 pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
     let path = path.as_ref();
 
     if check_exists && !path.exists() {
-        return Err(KreuzbergError::
-
-            path.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.display()),
         )));
     }
 

data/vendor/kreuzberg/src/extraction/excel.rs:

@@ -27,16 +27,23 @@
 //! # Ok(())
 //! # }
 //! ```
-use calamine::{Data, Range, Reader, open_workbook_auto};
+use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
 use std::collections::HashMap;
 use std::fmt::Write as FmtWrite;
-use std::io::Cursor;
+use std::io::{Cursor, Read, Seek};
 use std::path::Path;
 
 use crate::error::{KreuzbergError, Result};
 use crate::extraction::capacity;
 use crate::types::{ExcelSheet, ExcelWorkbook};
 
+/// Maximum number of cells in a Range's bounding box before we consider it pathological.
+/// This threshold is set to prevent OOM when processing files with sparse data at extreme
+/// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
+///
+/// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
+const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
+
 #[cfg(feature = "office")]
 use crate::extraction::office_metadata::{
     extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,

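For scale: a sheet with cells at both A1 and XFD1048575 (Excel's last addressable cell) spans 16,384 columns by 1,048,576 rows. A quick check of the guard's arithmetic:

```rust
// Worked example of the bounding-box guard: the full Excel grid is
// ~17.2 billion cells, far past MAX_BOUNDING_BOX_CELLS.
fn main() {
    const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
    let bb_cells: u64 = 16_384u64.saturating_mul(1_048_576);
    assert_eq!(bb_cells, 17_179_869_184);
    assert!(bb_cells > MAX_BOUNDING_BOX_CELLS); // triggers the sparse path
}
```
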
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
 use serde_json::Value;
 
 pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
+    let lower_path = file_path.to_lowercase();
+
     #[cfg(feature = "office")]
-    let office_metadata = if
-        ||
-        ||
-        ||
+    let office_metadata = if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
     {
         extract_xlsx_office_metadata_from_file(file_path).ok()
     } else {

@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
     #[cfg(not(feature = "office"))]
     let office_metadata: Option<HashMap<String, String>> = None;
 
-    //
+    // For XLSX files, use specialized handler with OOM protection
+    if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
+    {
+        let file = std::fs::File::open(file_path)?;
+        let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
+        return process_xlsx_workbook(workbook, office_metadata);
+    }
+
+    // For other formats, use open_workbook_auto
     let workbook = match open_workbook_auto(Path::new(file_path)) {
         Ok(wb) => wb,
         Err(calamine::Error::Io(io_err)) => {

@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbook> {
         ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
             let workbook = calamine::Xlsx::new(cursor)
                 .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
-
+            process_xlsx_workbook(workbook, office_metadata)
         }
         ".xls" | ".xla" => {
             let workbook = calamine::Xls::new(cursor)

@@ -118,6 +139,194 @@
     }
 }
 
+/// Process XLSX workbooks with special handling for pathological sparse files.
+///
+/// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
+/// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
+/// OOM when processing files like Excel Solver files that have cells at both A1 and
+/// XFD1048575, creating a bounding box of ~17 billion cells.
+fn process_xlsx_workbook<RS: Read + Seek>(
+    mut workbook: calamine::Xlsx<RS>,
+    office_metadata: Option<HashMap<String, String>>,
+) -> Result<ExcelWorkbook> {
+    let sheet_names = workbook.sheet_names();
+    let mut sheets = Vec::with_capacity(sheet_names.len());
+
+    for name in &sheet_names {
+        // Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
+        match process_xlsx_sheet_safe(&mut workbook, name) {
+            Ok(sheet) => sheets.push(sheet),
+            Err(e) => {
+                // Log but don't fail - continue with other sheets
+                tracing::warn!("Failed to process sheet '{}': {}", name, e);
+            }
+        }
+    }
+
+    let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
+    Ok(ExcelWorkbook { sheets, metadata })
+}
+
+/// Process a single XLSX sheet safely by pre-checking the bounding box.
+///
+/// This function streams cells to compute the actual bounding box without allocating
+/// a full Range, then only creates the Range if the bounding box is within safe limits.
+fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
+    // First pass: stream cells to compute actual bounding box and collect cell data
+    let (cells, row_min, row_max, col_min, col_max) = {
+        let mut cell_reader = workbook
+            .worksheet_cells_reader(sheet_name)
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
+
+        let mut cells: Vec<((u32, u32), Data)> = Vec::new();
+        let mut row_min = u32::MAX;
+        let mut row_max = 0u32;
+        let mut col_min = u32::MAX;
+        let mut col_max = 0u32;
+
+        // Stream through all cells, tracking bounds
+        while let Ok(Some(cell)) = cell_reader.next_cell() {
+            let (row, col) = cell.get_position();
+            row_min = row_min.min(row);
+            row_max = row_max.max(row);
+            col_min = col_min.min(col);
+            col_max = col_max.max(col);
+
+            // Convert DataRef to owned Data
+            let data: Data = match cell.get_value() {
+                DataRef::Empty => Data::Empty,
+                DataRef::String(s) => Data::String(s.clone()),
+                DataRef::SharedString(s) => Data::String(s.to_string()),
+                DataRef::Float(f) => Data::Float(*f),
+                DataRef::Int(i) => Data::Int(*i),
+                DataRef::Bool(b) => Data::Bool(*b),
+                DataRef::DateTime(dt) => Data::DateTime(*dt),
+                DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
+                DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
+                DataRef::Error(e) => Data::Error(e.clone()),
+            };
+            cells.push(((row, col), data));
+        }
+        (cells, row_min, row_max, col_min, col_max)
+    }; // cell_reader is dropped here, releasing the borrow
+
+    // Check if sheet is empty
+    if cells.is_empty() {
+        return Ok(ExcelSheet {
+            name: sheet_name.to_owned(),
+            markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
+            row_count: 0,
+            col_count: 0,
+            cell_count: 0,
+            table_cells: None,
+        });
+    }
+
+    // Calculate bounding box size
+    let bb_rows = (row_max - row_min + 1) as u64;
+    let bb_cols = (col_max - col_min + 1) as u64;
+    let bb_cells = bb_rows.saturating_mul(bb_cols);
+
+    // Check for pathological bounding box
+    if bb_cells > MAX_BOUNDING_BOX_CELLS {
+        // Sheet has sparse data at extreme positions - process directly from cells
+        return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
+    }
+
+    // Safe to create a Range - bounding box is within limits
+    // Use calamine's normal worksheet_range which will create the Range
+    let range = workbook
+        .worksheet_range(sheet_name)
+        .map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
+
+    Ok(process_sheet(sheet_name, &range))
+}
+
+/// Process a sparse sheet directly from collected cells without creating a full Range.
+///
+/// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
+/// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
+fn process_sparse_sheet_from_cells(
+    sheet_name: &str,
+    cells: Vec<((u32, u32), Data)>,
+    row_min: u32,
+    row_max: u32,
+    col_min: u32,
+    col_max: u32,
+) -> Result<ExcelSheet> {
+    let cell_count = cells.len();
+    let bb_rows = (row_max - row_min + 1) as usize;
+    let bb_cols = (col_max - col_min + 1) as usize;
+
+    // Create a warning message about the sparse data
+    let mut markdown = String::with_capacity(500 + cell_count * 50);
+    write!(
+        markdown,
+        "## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
+         Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
+        sheet_name, bb_rows, bb_cols, cell_count
+    )
+    .expect("write to String cannot fail");
+
+    // Group cells by row for tabular display
+    let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
+    for ((row, col), data) in &cells {
+        cells_by_row.entry(*row).or_default().push((*col, data));
+    }
+
+    // Sort rows and output as simple key-value pairs
+    let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
+    rows.sort_unstable();
+
+    // Limit output to first 1000 cells to avoid huge output
+    let mut output_count = 0;
+    const MAX_OUTPUT_CELLS: usize = 1000;
+
+    for row in rows {
+        if output_count >= MAX_OUTPUT_CELLS {
+            write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
+                .expect("write to String cannot fail");
+            break;
+        }
+
+        let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
+        row_cells.sort_by_key(|(col, _)| *col);
+
+        for (col, data) in row_cells {
+            if output_count >= MAX_OUTPUT_CELLS {
+                break;
+            }
+            let cell_ref = col_to_excel_letter(col);
+            let cell_str = format_cell_to_string(data);
+            if !cell_str.is_empty() {
+                writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
+                output_count += 1;
+            }
+        }
+    }
+
+    Ok(ExcelSheet {
+        name: sheet_name.to_owned(),
+        markdown,
+        row_count: bb_rows,
+        col_count: bb_cols,
+        cell_count,
+        table_cells: None, // No structured table for sparse sheets
+    })
+}
+
+/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
+fn col_to_excel_letter(col: u32) -> String {
+    let mut result = String::new();
+    let mut n = col + 1; // 1-indexed for calculation
+    while n > 0 {
+        n -= 1;
+        result.insert(0, (b'A' + (n % 26) as u8) as char);
+        n /= 26;
+    }
+    result
+}
+
 fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
 where
     RS: std::io::Read + std::io::Seek,

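`col_to_excel_letter` uses bijective base-26 (there is no zero digit, so Z rolls over to AA). Reproducing the function from the hunk above with a few spot checks:

```rust
// Spot checks for the bijective base-26 conversion added above.
fn col_to_excel_letter(col: u32) -> String {
    let mut result = String::new();
    let mut n = col + 1; // shift to 1-indexed so Z/AA roll over correctly
    while n > 0 {
        n -= 1;
        result.insert(0, (b'A' + (n % 26) as u8) as char);
        n /= 26;
    }
    result
}

fn main() {
    assert_eq!(col_to_excel_letter(0), "A");
    assert_eq!(col_to_excel_letter(25), "Z");
    assert_eq!(col_to_excel_letter(26), "AA");
    assert_eq!(col_to_excel_letter(16_383), "XFD"); // Excel's last column
}
```
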
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
     let (rows, cols) = range.get_size();
     let cell_count = range.used_cells().count();
 
-
+    // Fix for issue #331: Use actual cell count instead of declared dimensions
+    // to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
+    // Declared dimensions can claim A1:XFD1048575 (~17T cells) while actual data is minimal.
+    let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);
 
     if rows == 0 || cols == 0 {
         let markdown = format!("## {}\n\n*Empty sheet*", name);

@@ -176,6 +388,31 @@
 ///
 /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
 fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
+    // Fix for issue #331: Protect against extreme declared dimensions.
+    // Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
+    // Calling range.rows().collect() would iterate ALL declared rows causing OOM.
+    const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
+
+    let (declared_rows, _declared_cols) = range.get_size();
+
+    // If declared rows exceed reasonable limit, skip processing to avoid OOM
+    if declared_rows > MAX_REASONABLE_ROWS {
+        let actual_cell_count = range.used_cells().count();
+
+        // If actual data is minimal compared to declared size, it's a sparse/pathological file
+        if actual_cell_count < 10_000 {
+            // Return minimal output instead of OOM
+            let result_capacity = 100 + sheet_name.len();
+            let mut result = String::with_capacity(result_capacity);
+            write!(
+                result,
+                "## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
+                sheet_name, declared_rows, actual_cell_count
+            ).unwrap();
+            return (result, Vec::new());
+        }
+    }
+
     let rows: Vec<_> = range.rows().collect();
     if rows.is_empty() {
         let result_capacity = 50 + sheet_name.len();

data/vendor/kreuzberg/src/extraction/pptx/parser.rs:

@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
         }
     }
 
+    // Sort slide paths to ensure correct ordering regardless of XML order.
+    // PowerPoint doesn't guarantee relationship order in the rels file.
+    // GitHub Issue #329: Without sorting, slides can be processed in wrong order,
+    // causing images to have incorrect page numbers.
+    slide_paths.sort();
+
     Ok(slide_paths)
 }