kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.1
|
|
20
|
+
> **🚀 Version 4.2.1 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
use std::net::{IpAddr, SocketAddr};
|
|
4
4
|
|
|
5
|
-
use crate::{ExtractionConfig, Result, core::ServerConfig};
|
|
5
|
+
use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
|
|
6
6
|
|
|
7
7
|
use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
|
|
8
8
|
|
|
@@ -80,6 +80,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
|
80
80
|
server_config.max_multipart_field_bytes,
|
|
81
81
|
);
|
|
82
82
|
|
|
83
|
+
// Validate plugins at startup
|
|
84
|
+
validate_plugins_at_startup()?;
|
|
85
|
+
|
|
83
86
|
serve_with_config_and_limits(host, port, extraction_config, limits).await
|
|
84
87
|
}
|
|
85
88
|
|
|
@@ -111,6 +114,10 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
|
|
|
111
114
|
"Upload size limit: 100 MB (default, {} bytes)",
|
|
112
115
|
limits.max_request_body_bytes
|
|
113
116
|
);
|
|
117
|
+
|
|
118
|
+
// Validate plugins at startup
|
|
119
|
+
validate_plugins_at_startup()?;
|
|
120
|
+
|
|
114
121
|
serve_with_config_and_limits(host, port, config, limits).await
|
|
115
122
|
}
|
|
116
123
|
|
|
@@ -158,6 +165,9 @@ pub async fn serve_with_config_and_limits(
|
|
|
158
165
|
let addr = SocketAddr::new(ip, port);
|
|
159
166
|
let app = create_router_with_limits_and_server_config(config, limits, server_config);
|
|
160
167
|
|
|
168
|
+
// Validate plugins at startup
|
|
169
|
+
validate_plugins_at_startup()?;
|
|
170
|
+
|
|
161
171
|
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
162
172
|
|
|
163
173
|
let listener = tokio::net::TcpListener::bind(addr)
|
|
@@ -214,6 +224,9 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
214
224
|
let addr = SocketAddr::new(ip, server_config.port);
|
|
215
225
|
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
216
226
|
|
|
227
|
+
// Validate plugins at startup
|
|
228
|
+
validate_plugins_at_startup()?;
|
|
229
|
+
|
|
217
230
|
tracing::info!(
|
|
218
231
|
"Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
|
|
219
232
|
ip,
|
|
@@ -238,6 +251,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
238
251
|
/// Defaults: host = "127.0.0.1", port = 8000
|
|
239
252
|
///
|
|
240
253
|
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
254
|
+
/// Validates plugins at startup to help diagnose configuration issues.
|
|
241
255
|
pub async fn serve_default() -> Result<()> {
|
|
242
256
|
serve("127.0.0.1", 8000).await
|
|
243
257
|
}
|
|
@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
|
|
|
30
30
|
/// Valid tesseract OEM (OCR Engine Mode) values.
|
|
31
31
|
const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
|
|
32
32
|
|
|
33
|
-
/// Valid output formats for
|
|
34
|
-
|
|
33
|
+
/// Valid output formats for document extraction.
|
|
34
|
+
/// Supports plain text, markdown, djot, and HTML output formats.
|
|
35
|
+
/// Also accepts aliases: "text" for "plain", "md" for "markdown".
|
|
36
|
+
const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
|
|
35
37
|
|
|
36
38
|
/// Validate a binarization method string.
|
|
37
39
|
///
|
|
@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
|
248
250
|
}
|
|
249
251
|
}
|
|
250
252
|
|
|
251
|
-
/// Validate a
|
|
253
|
+
/// Validate a document extraction output format.
|
|
254
|
+
///
|
|
255
|
+
/// Accepts the following formats and aliases:
|
|
256
|
+
/// - "plain" or "text" for plain text output
|
|
257
|
+
/// - "markdown" or "md" for Markdown output
|
|
258
|
+
/// - "djot" for Djot markup format
|
|
259
|
+
/// - "html" for HTML output
|
|
252
260
|
///
|
|
253
261
|
/// # Arguments
|
|
254
262
|
///
|
|
255
|
-
/// * `format` - The output format to validate
|
|
263
|
+
/// * `format` - The output format to validate
|
|
256
264
|
///
|
|
257
265
|
/// # Returns
|
|
258
266
|
///
|
|
@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
|
264
272
|
/// use kreuzberg::core::config_validation::validate_output_format;
|
|
265
273
|
///
|
|
266
274
|
/// assert!(validate_output_format("text").is_ok());
|
|
275
|
+
/// assert!(validate_output_format("plain").is_ok());
|
|
267
276
|
/// assert!(validate_output_format("markdown").is_ok());
|
|
277
|
+
/// assert!(validate_output_format("md").is_ok());
|
|
278
|
+
/// assert!(validate_output_format("djot").is_ok());
|
|
279
|
+
/// assert!(validate_output_format("html").is_ok());
|
|
268
280
|
/// assert!(validate_output_format("json").is_err());
|
|
269
281
|
/// ```
|
|
270
282
|
pub fn validate_output_format(format: &str) -> Result<()> {
|
|
@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
|
|
|
106
106
|
///
|
|
107
107
|
/// # Errors
|
|
108
108
|
///
|
|
109
|
-
/// Returns `KreuzbergError::
|
|
109
|
+
/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
|
110
110
|
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|
111
|
-
/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
|
|
112
111
|
///
|
|
113
112
|
/// # Example
|
|
114
113
|
///
|
|
@@ -411,7 +411,8 @@ mod tests {
|
|
|
411
411
|
|
|
412
412
|
assert!(result.is_err());
|
|
413
413
|
use crate::KreuzbergError;
|
|
414
|
-
|
|
414
|
+
// File validation returns Io error, not Validation error
|
|
415
|
+
assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
|
|
415
416
|
}
|
|
416
417
|
|
|
417
418
|
#[test]
|
|
@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
|
|
|
61
61
|
///
|
|
62
62
|
/// # Errors
|
|
63
63
|
///
|
|
64
|
-
/// Returns `KreuzbergError::
|
|
64
|
+
/// Returns `KreuzbergError::Io` if the file doesn't exist.
|
|
65
65
|
pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
|
|
66
66
|
if !file_exists(&path) {
|
|
67
|
-
return Err(KreuzbergError::
|
|
68
|
-
|
|
69
|
-
path.as_ref().display()
|
|
67
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
68
|
+
std::io::ErrorKind::NotFound,
|
|
69
|
+
format!("File does not exist: {}", path.as_ref().display()),
|
|
70
70
|
)));
|
|
71
71
|
}
|
|
72
72
|
Ok(())
|
|
@@ -99,9 +99,9 @@ where
|
|
|
99
99
|
let mut files = Vec::new();
|
|
100
100
|
|
|
101
101
|
if !dir.is_dir() {
|
|
102
|
-
return Err(KreuzbergError::
|
|
103
|
-
|
|
104
|
-
dir.display()
|
|
102
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
103
|
+
std::io::ErrorKind::NotADirectory,
|
|
104
|
+
format!("Path is not a directory: {}", dir.display()),
|
|
105
105
|
)));
|
|
106
106
|
}
|
|
107
107
|
|
|
@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
|
|
|
231
231
|
///
|
|
232
232
|
/// # Errors
|
|
233
233
|
///
|
|
234
|
-
/// Returns `KreuzbergError::
|
|
234
|
+
/// Returns `KreuzbergError::Io` if the file doesn't exist (when `check_exists` is true).
|
|
235
235
|
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
|
236
236
|
pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
|
|
237
237
|
let path = path.as_ref();
|
|
238
238
|
|
|
239
239
|
if check_exists && !path.exists() {
|
|
240
|
-
return Err(KreuzbergError::
|
|
241
|
-
|
|
242
|
-
path.display()
|
|
240
|
+
return Err(KreuzbergError::from(std::io::Error::new(
|
|
241
|
+
std::io::ErrorKind::NotFound,
|
|
242
|
+
format!("File does not exist: {}", path.display()),
|
|
243
243
|
)));
|
|
244
244
|
}
|
|
245
245
|
|
|
@@ -219,10 +219,10 @@ pub fn get_or_init_model(
|
|
|
219
219
|
// This prevents panics that cannot unwind through FFI boundaries
|
|
220
220
|
fn ensure_onnx_available() -> Result<(), String> {
|
|
221
221
|
// Check if ORT_DYLIB_PATH is already set and valid
|
|
222
|
-
if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
222
|
+
if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
|
|
223
|
+
&& std::path::Path::new(&path).exists()
|
|
224
|
+
{
|
|
225
|
+
return Ok(());
|
|
226
226
|
}
|
|
227
227
|
|
|
228
228
|
// Check common installation paths and set ORT_DYLIB_PATH if found
|
|
@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
|
|
|
384
384
|
}
|
|
385
385
|
}
|
|
386
386
|
|
|
387
|
+
// Sort slide paths to ensure correct ordering regardless of XML order.
|
|
388
|
+
// PowerPoint doesn't guarantee relationship order in the rels file.
|
|
389
|
+
// GitHub Issue #329: Without sorting, slides can be processed in wrong order,
|
|
390
|
+
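// causing images to have incorrect page numbers. NOTE(review): this is a lexicographic string sort, so a path like "slide10.xml" would order before "slide2.xml" — confirm decks with 10+ slides still come out in the intended order.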
|
|
391
|
+
slide_paths.sort();
|
|
392
|
+
|
|
387
393
|
Ok(slide_paths)
|
|
388
394
|
}
|
|
@@ -4,25 +4,72 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::{ExtractionConfig, ExtractionResult as KreuzbergResult};
|
|
6
6
|
|
|
7
|
+
/// Merge extraction configuration using JSON-level merge.
|
|
8
|
+
///
|
|
9
|
+
/// This function performs a JSON-level merge where fields present in the override
|
|
10
|
+
/// JSON take precedence over the base config. This approach correctly handles
|
|
11
|
+
/// boolean fields that are explicitly set to their default values.
|
|
12
|
+
///
|
|
13
|
+
/// # Strategy
|
|
14
|
+
///
|
|
15
|
+
/// 1. Serialize base config to JSON
|
|
16
|
+
/// 2. For each top-level field in the override JSON, replace the same field in the base JSON (shallow override: nested objects are replaced whole, not merged recursively)
|
|
17
|
+
/// 3. Deserialize merged JSON back to ExtractionConfig
|
|
18
|
+
///
|
|
19
|
+
/// This ensures that explicitly provided values always take precedence, even if
|
|
20
|
+
/// they match the default value. Unspecified fields are preserved from base config.
|
|
21
|
+
///
|
|
22
|
+
/// # Examples
|
|
23
|
+
///
|
|
24
|
+
/// ```rust,no_run
|
|
25
|
+
/// use kreuzberg::ExtractionConfig;
|
|
26
|
+
/// use serde_json::json;
|
|
27
|
+
///
|
|
28
|
+
/// let mut base = ExtractionConfig::default();
|
|
29
|
+
/// base.use_cache = true;
|
|
30
|
+
///
|
|
31
|
+
/// let override_json = json!({
|
|
32
|
+
/// "force_ocr": true,
|
|
33
|
+
/// });
|
|
34
|
+
///
|
|
35
|
+
/// let merged = merge_configs(&base, override_json).unwrap();
|
|
36
|
+
/// assert_eq!(merged.use_cache, true); // from base
|
|
37
|
+
/// assert_eq!(merged.force_ocr, true); // from override
|
|
38
|
+
/// ```
|
|
39
|
+
fn merge_configs(base: &ExtractionConfig, override_json: serde_json::Value) -> Result<ExtractionConfig, String> {
|
|
40
|
+
// Serialize base config to JSON
|
|
41
|
+
let mut config_json =
|
|
42
|
+
serde_json::to_value(base).map_err(|e| format!("Failed to serialize base config to JSON: {}", e))?;
|
|
43
|
+
|
|
44
|
+
// Merge JSON value into config JSON (simple field-by-field merge)
|
|
45
|
+
// For each key in the provided JSON, override the corresponding key in config JSON; a non-object override value is skipped, leaving the base config unchanged
|
|
46
|
+
if let serde_json::Value::Object(json_obj) = override_json
|
|
47
|
+
&& let Some(config_obj) = config_json.as_object_mut()
|
|
48
|
+
{
|
|
49
|
+
for (key, value) in json_obj {
|
|
50
|
+
config_obj.insert(key, value);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Deserialize merged JSON back to ExtractionConfig
|
|
55
|
+
serde_json::from_value(config_json).map_err(|e| format!("Failed to deserialize merged config: {}", e))
|
|
56
|
+
}
|
|
57
|
+
|
|
7
58
|
/// Build extraction config from MCP parameters.
|
|
8
59
|
///
|
|
9
|
-
///
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
output_format: None,
|
|
19
|
-
})
|
|
60
|
+
/// Merges the provided config JSON (if any) with the default config using JSON-level
|
|
61
|
+
/// merge semantics. Unspecified fields in the JSON preserve their values from the default config.
|
|
62
|
+
pub(super) fn build_config(
|
|
63
|
+
default_config: &ExtractionConfig,
|
|
64
|
+
config_json: Option<serde_json::Value>,
|
|
65
|
+
) -> Result<ExtractionConfig, String> {
|
|
66
|
+
if let Some(json) = config_json {
|
|
67
|
+
// Merge using JSON-level merge: provided JSON fields override default config
|
|
68
|
+
merge_configs(default_config, json)
|
|
20
69
|
} else {
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
config
|
|
70
|
+
// No config provided, use default
|
|
71
|
+
Ok(default_config.clone())
|
|
72
|
+
}
|
|
26
73
|
}
|
|
27
74
|
|
|
28
75
|
/// Format extraction result as human-readable text.
|
|
@@ -54,20 +101,35 @@ mod tests {
|
|
|
54
101
|
use super::*;
|
|
55
102
|
|
|
56
103
|
#[test]
|
|
57
|
-
fn
|
|
104
|
+
fn test_build_config_with_no_config() {
|
|
58
105
|
let default_config = ExtractionConfig::default();
|
|
59
106
|
|
|
60
|
-
let config = build_config(&default_config,
|
|
61
|
-
|
|
62
|
-
|
|
107
|
+
let config = build_config(&default_config, None).unwrap();
|
|
108
|
+
assert_eq!(config.use_cache, default_config.use_cache);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
#[test]
|
|
112
|
+
fn test_build_config_with_config_json() {
|
|
113
|
+
let default_config = ExtractionConfig::default();
|
|
114
|
+
let config_json = serde_json::json!({
|
|
115
|
+
"use_cache": false
|
|
116
|
+
});
|
|
63
117
|
|
|
64
|
-
let config = build_config(&default_config,
|
|
65
|
-
assert!(config.
|
|
66
|
-
|
|
118
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
119
|
+
assert!(!config.use_cache);
|
|
120
|
+
}
|
|
67
121
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
122
|
+
#[test]
|
|
123
|
+
fn test_build_config_with_invalid_config_json() {
|
|
124
|
+
let default_config = ExtractionConfig::default();
|
|
125
|
+
// Provide invalid type for a field (string instead of boolean)
|
|
126
|
+
let config_json = serde_json::json!({
|
|
127
|
+
"use_cache": "not_a_boolean"
|
|
128
|
+
});
|
|
129
|
+
|
|
130
|
+
let result = build_config(&default_config, Some(config_json));
|
|
131
|
+
assert!(result.is_err());
|
|
132
|
+
assert!(result.unwrap_err().contains("Failed to deserialize"));
|
|
71
133
|
}
|
|
72
134
|
|
|
73
135
|
#[test]
|
|
@@ -77,31 +139,167 @@ mod tests {
|
|
|
77
139
|
..Default::default()
|
|
78
140
|
};
|
|
79
141
|
|
|
80
|
-
let config = build_config(&default_config,
|
|
142
|
+
let config = build_config(&default_config, None).unwrap();
|
|
81
143
|
|
|
82
144
|
assert!(!config.use_cache);
|
|
83
145
|
}
|
|
84
146
|
|
|
85
147
|
#[test]
|
|
86
|
-
fn
|
|
87
|
-
let default_config = ExtractionConfig
|
|
148
|
+
fn test_build_config_overrides_default_settings() {
|
|
149
|
+
let default_config = ExtractionConfig {
|
|
150
|
+
use_cache: true,
|
|
151
|
+
..Default::default()
|
|
152
|
+
};
|
|
88
153
|
|
|
89
|
-
let
|
|
154
|
+
let config_json = serde_json::json!({
|
|
155
|
+
"use_cache": false
|
|
156
|
+
});
|
|
90
157
|
|
|
91
|
-
|
|
92
|
-
assert!(!config.
|
|
158
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
159
|
+
assert!(!config.use_cache);
|
|
93
160
|
}
|
|
94
161
|
|
|
95
162
|
#[test]
|
|
96
|
-
fn
|
|
97
|
-
|
|
163
|
+
fn test_build_config_merges_partial_config() {
|
|
164
|
+
// Base config with custom use_cache setting
|
|
165
|
+
let default_config = ExtractionConfig {
|
|
166
|
+
use_cache: false,
|
|
167
|
+
enable_quality_processing: true,
|
|
168
|
+
force_ocr: false,
|
|
169
|
+
..Default::default()
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
// Override only force_ocr
|
|
173
|
+
let config_json = serde_json::json!({
|
|
174
|
+
"force_ocr": true
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
178
|
+
|
|
179
|
+
// use_cache should be preserved from default_config
|
|
180
|
+
assert!(!config.use_cache, "use_cache should be preserved from default config");
|
|
181
|
+
// enable_quality_processing should be preserved
|
|
182
|
+
assert!(
|
|
183
|
+
config.enable_quality_processing,
|
|
184
|
+
"enable_quality_processing should be preserved"
|
|
185
|
+
);
|
|
186
|
+
// force_ocr should be overridden
|
|
187
|
+
assert!(config.force_ocr, "force_ocr should be overridden to true");
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
#[test]
|
|
191
|
+
fn test_build_config_merges_nested_config() {
|
|
192
|
+
let default_config = ExtractionConfig {
|
|
193
|
+
use_cache: true,
|
|
194
|
+
..Default::default()
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
// Override output format only
|
|
198
|
+
let config_json = serde_json::json!({
|
|
199
|
+
"output_format": "markdown"
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
203
|
+
|
|
204
|
+
// use_cache should be preserved
|
|
205
|
+
assert!(config.use_cache, "use_cache should be preserved from default config");
|
|
206
|
+
// output_format should be overridden
|
|
207
|
+
assert_eq!(
|
|
208
|
+
config.output_format,
|
|
209
|
+
crate::core::config::formats::OutputFormat::Markdown,
|
|
210
|
+
"output_format should be overridden to markdown"
|
|
211
|
+
);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
#[test]
|
|
215
|
+
fn test_build_config_merges_with_custom_defaults() {
|
|
216
|
+
// Create a default config with custom values
|
|
217
|
+
let default_config = ExtractionConfig {
|
|
218
|
+
use_cache: false,
|
|
219
|
+
enable_quality_processing: true,
|
|
220
|
+
force_ocr: false,
|
|
221
|
+
..Default::default()
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
// Provide partial override (only force_ocr)
|
|
225
|
+
let config_json = serde_json::json!({
|
|
226
|
+
"force_ocr": true,
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
230
|
+
|
|
231
|
+
// force_ocr should be overridden
|
|
232
|
+
assert!(config.force_ocr, "force_ocr should be overridden to true");
|
|
233
|
+
// use_cache should be preserved from default_config
|
|
234
|
+
assert!(
|
|
235
|
+
!config.use_cache,
|
|
236
|
+
"use_cache should be preserved from default config (false)"
|
|
237
|
+
);
|
|
238
|
+
// enable_quality_processing should be preserved
|
|
239
|
+
assert!(
|
|
240
|
+
config.enable_quality_processing,
|
|
241
|
+
"enable_quality_processing should be preserved (true)"
|
|
242
|
+
);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
#[test]
|
|
246
|
+
fn test_build_config_merges_multiple_fields() {
|
|
247
|
+
let default_config = ExtractionConfig {
|
|
248
|
+
use_cache: true,
|
|
249
|
+
enable_quality_processing: false,
|
|
250
|
+
force_ocr: true,
|
|
251
|
+
..Default::default()
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
// Override multiple fields
|
|
255
|
+
let config_json = serde_json::json!({
|
|
256
|
+
"use_cache": false,
|
|
257
|
+
"output_format": "markdown",
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
261
|
+
|
|
262
|
+
// use_cache should be overridden
|
|
263
|
+
assert!(!config.use_cache, "use_cache should be overridden to false");
|
|
264
|
+
// output_format should be overridden
|
|
265
|
+
assert_eq!(
|
|
266
|
+
config.output_format,
|
|
267
|
+
crate::core::config::formats::OutputFormat::Markdown,
|
|
268
|
+
"output_format should be overridden to markdown"
|
|
269
|
+
);
|
|
270
|
+
// force_ocr should be preserved (not in override)
|
|
271
|
+
assert!(
|
|
272
|
+
config.force_ocr,
|
|
273
|
+
"force_ocr should be preserved from default config (true)"
|
|
274
|
+
);
|
|
275
|
+
// enable_quality_processing should be preserved
|
|
276
|
+
assert!(
|
|
277
|
+
!config.enable_quality_processing,
|
|
278
|
+
"enable_quality_processing should be preserved (false)"
|
|
279
|
+
);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
#[test]
|
|
283
|
+
fn test_build_config_boolean_override_to_default_value() {
|
|
284
|
+
// This test validates the critical bug fix: when user explicitly sets a boolean
|
|
285
|
+
// to its default value, the merge logic should correctly use the override value,
|
|
286
|
+
// not fall back to the base config.
|
|
287
|
+
let base = ExtractionConfig {
|
|
288
|
+
use_cache: false,
|
|
289
|
+
..Default::default()
|
|
290
|
+
};
|
|
291
|
+
|
|
292
|
+
// User explicitly provides use_cache: true (which IS the default)
|
|
293
|
+
let override_json = serde_json::json!({"use_cache": true});
|
|
98
294
|
|
|
99
|
-
let
|
|
295
|
+
let merged = build_config(&base, Some(override_json)).unwrap();
|
|
100
296
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
297
|
+
// Before the fix: merged.use_cache would be false (WRONG - fell back to base)
|
|
298
|
+
// After the fix: merged.use_cache should be true (CORRECT - override applied)
|
|
299
|
+
assert!(
|
|
300
|
+
merged.use_cache,
|
|
301
|
+
"Should use explicit override even if it matches default"
|
|
302
|
+
);
|
|
105
303
|
}
|
|
106
304
|
|
|
107
305
|
#[test]
|
|
@@ -12,12 +12,9 @@ pub struct ExtractFileParams {
|
|
|
12
12
|
/// Optional MIME type hint (auto-detected if not provided)
|
|
13
13
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
14
14
|
pub mime_type: Option<String>,
|
|
15
|
-
///
|
|
16
|
-
#[serde(
|
|
17
|
-
pub
|
|
18
|
-
/// Force OCR even if text extraction succeeds
|
|
19
|
-
#[serde(default)]
|
|
20
|
-
pub force_ocr: bool,
|
|
15
|
+
/// Extraction configuration (JSON object)
|
|
16
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
17
|
+
pub config: Option<serde_json::Value>,
|
|
21
18
|
/// Use async extraction (default: false for sync)
|
|
22
19
|
#[serde(default)]
|
|
23
20
|
pub r#async: bool,
|
|
@@ -31,12 +28,9 @@ pub struct ExtractBytesParams {
|
|
|
31
28
|
/// Optional MIME type hint (auto-detected if not provided)
|
|
32
29
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
33
30
|
pub mime_type: Option<String>,
|
|
34
|
-
///
|
|
35
|
-
#[serde(
|
|
36
|
-
pub
|
|
37
|
-
/// Force OCR even if text extraction succeeds
|
|
38
|
-
#[serde(default)]
|
|
39
|
-
pub force_ocr: bool,
|
|
31
|
+
/// Extraction configuration (JSON object)
|
|
32
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
33
|
+
pub config: Option<serde_json::Value>,
|
|
40
34
|
/// Use async extraction (default: false for sync)
|
|
41
35
|
#[serde(default)]
|
|
42
36
|
pub r#async: bool,
|
|
@@ -47,12 +41,9 @@ pub struct ExtractBytesParams {
|
|
|
47
41
|
pub struct BatchExtractFilesParams {
|
|
48
42
|
/// Paths to files to extract
|
|
49
43
|
pub paths: Vec<String>,
|
|
50
|
-
///
|
|
51
|
-
#[serde(
|
|
52
|
-
pub
|
|
53
|
-
/// Force OCR even if text extraction succeeds
|
|
54
|
-
#[serde(default)]
|
|
55
|
-
pub force_ocr: bool,
|
|
44
|
+
/// Extraction configuration (JSON object)
|
|
45
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
46
|
+
pub config: Option<serde_json::Value>,
|
|
56
47
|
/// Use async extraction (default: false for sync)
|
|
57
48
|
#[serde(default)]
|
|
58
49
|
pub r#async: bool,
|
|
@@ -83,8 +74,7 @@ mod tests {
|
|
|
83
74
|
|
|
84
75
|
assert_eq!(params.path, "/test.pdf");
|
|
85
76
|
assert_eq!(params.mime_type, None);
|
|
86
|
-
|
|
87
|
-
assert!(!params.force_ocr);
|
|
77
|
+
assert_eq!(params.config, None);
|
|
88
78
|
assert!(!params.r#async);
|
|
89
79
|
}
|
|
90
80
|
|
|
@@ -95,8 +85,7 @@ mod tests {
|
|
|
95
85
|
|
|
96
86
|
assert_eq!(params.data, "SGVsbG8=");
|
|
97
87
|
assert_eq!(params.mime_type, None);
|
|
98
|
-
|
|
99
|
-
assert!(!params.force_ocr);
|
|
88
|
+
assert_eq!(params.config, None);
|
|
100
89
|
assert!(!params.r#async);
|
|
101
90
|
}
|
|
102
91
|
|
|
@@ -106,8 +95,7 @@ mod tests {
|
|
|
106
95
|
let params: BatchExtractFilesParams = serde_json::from_str(json).unwrap();
|
|
107
96
|
|
|
108
97
|
assert_eq!(params.paths.len(), 2);
|
|
109
|
-
|
|
110
|
-
assert!(!params.force_ocr);
|
|
98
|
+
assert_eq!(params.config, None);
|
|
111
99
|
assert!(!params.r#async);
|
|
112
100
|
}
|
|
113
101
|
|
|
@@ -128,13 +116,21 @@ mod tests {
|
|
|
128
116
|
assert!(!params.use_content);
|
|
129
117
|
}
|
|
130
118
|
|
|
119
|
+
#[test]
|
|
120
|
+
fn test_extract_file_params_with_config() {
|
|
121
|
+
let json = r#"{"path": "/test.pdf", "config": {"use_cache": false}}"#;
|
|
122
|
+
let params: ExtractFileParams = serde_json::from_str(json).unwrap();
|
|
123
|
+
|
|
124
|
+
assert_eq!(params.path, "/test.pdf");
|
|
125
|
+
assert!(params.config.is_some());
|
|
126
|
+
}
|
|
127
|
+
|
|
131
128
|
#[test]
|
|
132
129
|
fn test_extract_file_params_serialization() {
|
|
133
130
|
let params = ExtractFileParams {
|
|
134
131
|
path: "/test.pdf".to_string(),
|
|
135
132
|
mime_type: Some("application/pdf".to_string()),
|
|
136
|
-
|
|
137
|
-
force_ocr: false,
|
|
133
|
+
config: Some(serde_json::json!({"use_cache": false})),
|
|
138
134
|
r#async: true,
|
|
139
135
|
};
|
|
140
136
|
|
|
@@ -143,8 +139,7 @@ mod tests {
|
|
|
143
139
|
|
|
144
140
|
assert_eq!(params.path, deserialized.path);
|
|
145
141
|
assert_eq!(params.mime_type, deserialized.mime_type);
|
|
146
|
-
assert_eq!(params.
|
|
147
|
-
assert_eq!(params.force_ocr, deserialized.force_ocr);
|
|
142
|
+
assert_eq!(params.config, deserialized.config);
|
|
148
143
|
assert_eq!(params.r#async, deserialized.r#async);
|
|
149
144
|
}
|
|
150
145
|
|
|
@@ -153,8 +148,7 @@ mod tests {
|
|
|
153
148
|
let params = ExtractBytesParams {
|
|
154
149
|
data: "SGVsbG8=".to_string(),
|
|
155
150
|
mime_type: None,
|
|
156
|
-
|
|
157
|
-
force_ocr: false,
|
|
151
|
+
config: None,
|
|
158
152
|
r#async: false,
|
|
159
153
|
};
|
|
160
154
|
|
|
@@ -168,8 +162,7 @@ mod tests {
|
|
|
168
162
|
fn test_batch_extract_params_serialization() {
|
|
169
163
|
let params = BatchExtractFilesParams {
|
|
170
164
|
paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
|
|
171
|
-
|
|
172
|
-
force_ocr: true,
|
|
165
|
+
config: Some(serde_json::json!({"use_cache": true})),
|
|
173
166
|
r#async: true,
|
|
174
167
|
};
|
|
175
168
|
|
|
@@ -177,7 +170,7 @@ mod tests {
|
|
|
177
170
|
let deserialized: BatchExtractFilesParams = serde_json::from_str(&json).unwrap();
|
|
178
171
|
|
|
179
172
|
assert_eq!(params.paths, deserialized.paths);
|
|
180
|
-
assert_eq!(params.
|
|
173
|
+
assert_eq!(params.config, deserialized.config);
|
|
181
174
|
}
|
|
182
175
|
|
|
183
176
|
#[test]
|