kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.1.2 Release**
20
+ > **🚀 Version 4.2.1 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -2,7 +2,7 @@
2
2
 
3
3
  use std::net::{IpAddr, SocketAddr};
4
4
 
5
- use crate::{ExtractionConfig, Result, core::ServerConfig};
5
+ use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
6
6
 
7
7
  use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
8
8
 
@@ -80,6 +80,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
80
80
  server_config.max_multipart_field_bytes,
81
81
  );
82
82
 
83
+ // Validate plugins at startup
84
+ validate_plugins_at_startup()?;
85
+
83
86
  serve_with_config_and_limits(host, port, extraction_config, limits).await
84
87
  }
85
88
 
@@ -111,6 +114,10 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
111
114
  "Upload size limit: 100 MB (default, {} bytes)",
112
115
  limits.max_request_body_bytes
113
116
  );
117
+
118
+ // Validate plugins at startup
119
+ validate_plugins_at_startup()?;
120
+
114
121
  serve_with_config_and_limits(host, port, config, limits).await
115
122
  }
116
123
 
@@ -158,6 +165,9 @@ pub async fn serve_with_config_and_limits(
158
165
  let addr = SocketAddr::new(ip, port);
159
166
  let app = create_router_with_limits_and_server_config(config, limits, server_config);
160
167
 
168
+ // Validate plugins at startup
169
+ validate_plugins_at_startup()?;
170
+
161
171
  tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
162
172
 
163
173
  let listener = tokio::net::TcpListener::bind(addr)
@@ -214,6 +224,9 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
214
224
  let addr = SocketAddr::new(ip, server_config.port);
215
225
  let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
216
226
 
227
+ // Validate plugins at startup
228
+ validate_plugins_at_startup()?;
229
+
217
230
  tracing::info!(
218
231
  "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
219
232
  ip,
@@ -238,6 +251,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
238
251
  /// Defaults: host = "127.0.0.1", port = 8000
239
252
  ///
240
253
  /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
254
+ /// Validates plugins at startup to help diagnose configuration issues.
241
255
  pub async fn serve_default() -> Result<()> {
242
256
  serve("127.0.0.1", 8000).await
243
257
  }
@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
30
30
  /// Valid tesseract OEM (OCR Engine Mode) values.
31
31
  const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
32
32
 
33
- /// Valid output formats for tesseract.
34
- const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
33
+ /// Valid output formats for document extraction.
34
+ /// Supports plain text, markdown, djot, and HTML output formats.
35
+ /// Also accepts aliases: "text" for "plain", "md" for "markdown".
36
+ const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
35
37
 
36
38
  /// Validate a binarization method string.
37
39
  ///
@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
248
250
  }
249
251
  }
250
252
 
251
- /// Validate a tesseract output format.
253
+ /// Validate a document extraction output format.
254
+ ///
255
+ /// Accepts the following formats and aliases:
256
+ /// - "plain" or "text" for plain text output
257
+ /// - "markdown" or "md" for Markdown output
258
+ /// - "djot" for Djot markup format
259
+ /// - "html" for HTML output
252
260
  ///
253
261
  /// # Arguments
254
262
  ///
255
- /// * `format` - The output format to validate (e.g., "text", "markdown")
263
+ /// * `format` - The output format to validate
256
264
  ///
257
265
  /// # Returns
258
266
  ///
@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
264
272
  /// use kreuzberg::core::config_validation::validate_output_format;
265
273
  ///
266
274
  /// assert!(validate_output_format("text").is_ok());
275
+ /// assert!(validate_output_format("plain").is_ok());
267
276
  /// assert!(validate_output_format("markdown").is_ok());
277
+ /// assert!(validate_output_format("md").is_ok());
278
+ /// assert!(validate_output_format("djot").is_ok());
279
+ /// assert!(validate_output_format("html").is_ok());
268
280
  /// assert!(validate_output_format("json").is_err());
269
281
  /// ```
270
282
  pub fn validate_output_format(format: &str) -> Result<()> {
@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
106
106
  ///
107
107
  /// # Errors
108
108
  ///
109
- /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
109
+ /// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
110
110
  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
111
- /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
112
111
  ///
113
112
  /// # Example
114
113
  ///
@@ -411,7 +411,8 @@ mod tests {
411
411
 
412
412
  assert!(result.is_err());
413
413
  use crate::KreuzbergError;
414
- assert!(matches!(result.unwrap_err(), KreuzbergError::Validation { .. }));
414
+ // File validation returns Io error, not Validation error
415
+ assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
415
416
  }
416
417
 
417
418
  #[test]
@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
61
61
  ///
62
62
  /// # Errors
63
63
  ///
64
- /// Returns `KreuzbergError::Validation` if file doesn't exist.
64
+ /// Returns `KreuzbergError::Io` if file doesn't exist.
65
65
  pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
66
66
  if !file_exists(&path) {
67
- return Err(KreuzbergError::validation(format!(
68
- "File does not exist: {}",
69
- path.as_ref().display()
67
+ return Err(KreuzbergError::from(std::io::Error::new(
68
+ std::io::ErrorKind::NotFound,
69
+ format!("File does not exist: {}", path.as_ref().display()),
70
70
  )));
71
71
  }
72
72
  Ok(())
@@ -99,9 +99,9 @@ where
99
99
  let mut files = Vec::new();
100
100
 
101
101
  if !dir.is_dir() {
102
- return Err(KreuzbergError::validation(format!(
103
- "Path is not a directory: {}",
104
- dir.display()
102
+ return Err(KreuzbergError::from(std::io::Error::new(
103
+ std::io::ErrorKind::NotADirectory,
104
+ format!("Path is not a directory: {}", dir.display()),
105
105
  )));
106
106
  }
107
107
 
@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
231
231
  ///
232
232
  /// # Errors
233
233
  ///
234
- /// Returns `KreuzbergError::Validation` if file doesn't exist (when `check_exists` is true).
234
+ /// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
235
235
  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
236
236
  pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
237
237
  let path = path.as_ref();
238
238
 
239
239
  if check_exists && !path.exists() {
240
- return Err(KreuzbergError::validation(format!(
241
- "File does not exist: {}",
242
- path.display()
240
+ return Err(KreuzbergError::from(std::io::Error::new(
241
+ std::io::ErrorKind::NotFound,
242
+ format!("File does not exist: {}", path.display()),
243
243
  )));
244
244
  }
245
245
 
@@ -219,10 +219,10 @@ pub fn get_or_init_model(
219
219
  // This prevents panics that cannot unwind through FFI boundaries
220
220
  fn ensure_onnx_available() -> Result<(), String> {
221
221
  // Check if ORT_DYLIB_PATH is already set and valid
222
- if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
223
- if std::path::Path::new(&path).exists() {
224
- return Ok(());
225
- }
222
+ if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
223
+ && std::path::Path::new(&path).exists()
224
+ {
225
+ return Ok(());
226
226
  }
227
227
 
228
228
  // Check common installation paths and set ORT_DYLIB_PATH if found
@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
384
384
  }
385
385
  }
386
386
 
387
+ // Sort slide paths to ensure correct ordering regardless of XML order.
388
+ // PowerPoint doesn't guarantee relationship order in the rels file.
389
+ // GitHub Issue #329: Without sorting, slides can be processed in wrong order,
390
+ // causing images to have incorrect page numbers.
391
+ slide_paths.sort();
392
+
387
393
  Ok(slide_paths)
388
394
  }
@@ -4,25 +4,72 @@
4
4
 
5
5
  use crate::{ExtractionConfig, ExtractionResult as KreuzbergResult};
6
6
 
7
+ /// Merge extraction configuration using JSON-level merge.
8
+ ///
9
+ /// This function performs a JSON-level merge where fields present in the override
10
+ /// JSON take precedence over the base config. This approach correctly handles
11
+ /// boolean fields that are explicitly set to their default values.
12
+ ///
13
+ /// # Strategy
14
+ ///
15
+ /// 1. Serialize base config to JSON
16
+ /// 2. For each field in the override JSON, merge into base JSON (field-by-field override)
17
+ /// 3. Deserialize merged JSON back to ExtractionConfig
18
+ ///
19
+ /// This ensures that explicitly provided values always take precedence, even if
20
+ /// they match the default value. Unspecified fields are preserved from base config.
21
+ ///
22
+ /// # Examples
23
+ ///
24
+ /// ```rust,no_run
25
+ /// use kreuzberg::{ExtractionConfig, OutputFormat};
26
+ /// use serde_json::json;
27
+ ///
28
+ /// let mut base = ExtractionConfig::default();
29
+ /// base.use_cache = true;
30
+ ///
31
+ /// let override_json = json!({
32
+ /// "force_ocr": true,
33
+ /// });
34
+ ///
35
+ /// let merged = merge_configs(&base, &override_json).unwrap();
36
+ /// assert_eq!(merged.use_cache, true); // from base
37
+ /// assert_eq!(merged.force_ocr, true); // from override
38
+ /// ```
39
+ fn merge_configs(base: &ExtractionConfig, override_json: serde_json::Value) -> Result<ExtractionConfig, String> {
40
+ // Serialize base config to JSON
41
+ let mut config_json =
42
+ serde_json::to_value(base).map_err(|e| format!("Failed to serialize base config to JSON: {}", e))?;
43
+
44
+ // Merge JSON value into config JSON (simple field-by-field merge)
45
+ // For each key in the provided JSON, override the corresponding key in config JSON
46
+ if let serde_json::Value::Object(json_obj) = override_json
47
+ && let Some(config_obj) = config_json.as_object_mut()
48
+ {
49
+ for (key, value) in json_obj {
50
+ config_obj.insert(key, value);
51
+ }
52
+ }
53
+
54
+ // Deserialize merged JSON back to ExtractionConfig
55
+ serde_json::from_value(config_json).map_err(|e| format!("Failed to deserialize merged config: {}", e))
56
+ }
57
+
7
58
  /// Build extraction config from MCP parameters.
8
59
  ///
9
- /// Starts with the default config and overlays OCR settings from request parameters.
10
- pub(super) fn build_config(default_config: &ExtractionConfig, enable_ocr: bool, force_ocr: bool) -> ExtractionConfig {
11
- let mut config = default_config.clone();
12
-
13
- config.ocr = if enable_ocr {
14
- Some(crate::OcrConfig {
15
- backend: "tesseract".to_string(),
16
- language: "eng".to_string(),
17
- tesseract_config: None,
18
- output_format: None,
19
- })
60
+ /// Merges the provided config JSON (if any) with the default config using JSON-level
61
+ /// merge semantics. Unspecified fields in the JSON preserve their values from the default config.
62
+ pub(super) fn build_config(
63
+ default_config: &ExtractionConfig,
64
+ config_json: Option<serde_json::Value>,
65
+ ) -> Result<ExtractionConfig, String> {
66
+ if let Some(json) = config_json {
67
+ // Merge using JSON-level merge: provided JSON fields override default config
68
+ merge_configs(default_config, json)
20
69
  } else {
21
- None
22
- };
23
- config.force_ocr = force_ocr;
24
-
25
- config
70
+ // No config provided, use default
71
+ Ok(default_config.clone())
72
+ }
26
73
  }
27
74
 
28
75
  /// Format extraction result as human-readable text.
@@ -54,20 +101,35 @@ mod tests {
54
101
  use super::*;
55
102
 
56
103
  #[test]
57
- fn test_build_config() {
104
+ fn test_build_config_with_no_config() {
58
105
  let default_config = ExtractionConfig::default();
59
106
 
60
- let config = build_config(&default_config, false, false);
61
- assert!(config.ocr.is_none());
62
- assert!(!config.force_ocr);
107
+ let config = build_config(&default_config, None).unwrap();
108
+ assert_eq!(config.use_cache, default_config.use_cache);
109
+ }
110
+
111
+ #[test]
112
+ fn test_build_config_with_config_json() {
113
+ let default_config = ExtractionConfig::default();
114
+ let config_json = serde_json::json!({
115
+ "use_cache": false
116
+ });
63
117
 
64
- let config = build_config(&default_config, true, false);
65
- assert!(config.ocr.is_some());
66
- assert!(!config.force_ocr);
118
+ let config = build_config(&default_config, Some(config_json)).unwrap();
119
+ assert!(!config.use_cache);
120
+ }
67
121
 
68
- let config = build_config(&default_config, true, true);
69
- assert!(config.ocr.is_some());
70
- assert!(config.force_ocr);
122
+ #[test]
123
+ fn test_build_config_with_invalid_config_json() {
124
+ let default_config = ExtractionConfig::default();
125
+ // Provide invalid type for a field (string instead of boolean)
126
+ let config_json = serde_json::json!({
127
+ "use_cache": "not_a_boolean"
128
+ });
129
+
130
+ let result = build_config(&default_config, Some(config_json));
131
+ assert!(result.is_err());
132
+ assert!(result.unwrap_err().contains("Failed to deserialize"));
71
133
  }
72
134
 
73
135
  #[test]
@@ -77,31 +139,167 @@ mod tests {
77
139
  ..Default::default()
78
140
  };
79
141
 
80
- let config = build_config(&default_config, false, false);
142
+ let config = build_config(&default_config, None).unwrap();
81
143
 
82
144
  assert!(!config.use_cache);
83
145
  }
84
146
 
85
147
  #[test]
86
- fn test_build_config_ocr_disabled_by_default() {
87
- let default_config = ExtractionConfig::default();
148
+ fn test_build_config_overrides_default_settings() {
149
+ let default_config = ExtractionConfig {
150
+ use_cache: true,
151
+ ..Default::default()
152
+ };
88
153
 
89
- let config = build_config(&default_config, false, false);
154
+ let config_json = serde_json::json!({
155
+ "use_cache": false
156
+ });
90
157
 
91
- assert!(config.ocr.is_none());
92
- assert!(!config.force_ocr);
158
+ let config = build_config(&default_config, Some(config_json)).unwrap();
159
+ assert!(!config.use_cache);
93
160
  }
94
161
 
95
162
  #[test]
96
- fn test_build_config_ocr_enabled_creates_tesseract_config() {
97
- let default_config = ExtractionConfig::default();
163
+ fn test_build_config_merges_partial_config() {
164
+ // Base config with custom use_cache setting
165
+ let default_config = ExtractionConfig {
166
+ use_cache: false,
167
+ enable_quality_processing: true,
168
+ force_ocr: false,
169
+ ..Default::default()
170
+ };
171
+
172
+ // Override only force_ocr
173
+ let config_json = serde_json::json!({
174
+ "force_ocr": true
175
+ });
176
+
177
+ let config = build_config(&default_config, Some(config_json)).unwrap();
178
+
179
+ // use_cache should be preserved from default_config
180
+ assert!(!config.use_cache, "use_cache should be preserved from default config");
181
+ // enable_quality_processing should be preserved
182
+ assert!(
183
+ config.enable_quality_processing,
184
+ "enable_quality_processing should be preserved"
185
+ );
186
+ // force_ocr should be overridden
187
+ assert!(config.force_ocr, "force_ocr should be overridden to true");
188
+ }
189
+
190
+ #[test]
191
+ fn test_build_config_merges_nested_config() {
192
+ let default_config = ExtractionConfig {
193
+ use_cache: true,
194
+ ..Default::default()
195
+ };
196
+
197
+ // Override output format only
198
+ let config_json = serde_json::json!({
199
+ "output_format": "markdown"
200
+ });
201
+
202
+ let config = build_config(&default_config, Some(config_json)).unwrap();
203
+
204
+ // use_cache should be preserved
205
+ assert!(config.use_cache, "use_cache should be preserved from default config");
206
+ // output_format should be overridden
207
+ assert_eq!(
208
+ config.output_format,
209
+ crate::core::config::formats::OutputFormat::Markdown,
210
+ "output_format should be overridden to markdown"
211
+ );
212
+ }
213
+
214
+ #[test]
215
+ fn test_build_config_merges_with_custom_defaults() {
216
+ // Create a default config with custom values
217
+ let default_config = ExtractionConfig {
218
+ use_cache: false,
219
+ enable_quality_processing: true,
220
+ force_ocr: false,
221
+ ..Default::default()
222
+ };
223
+
224
+ // Provide partial override (only force_ocr)
225
+ let config_json = serde_json::json!({
226
+ "force_ocr": true,
227
+ });
228
+
229
+ let config = build_config(&default_config, Some(config_json)).unwrap();
230
+
231
+ // force_ocr should be overridden
232
+ assert!(config.force_ocr, "force_ocr should be overridden to true");
233
+ // use_cache should be preserved from default_config
234
+ assert!(
235
+ !config.use_cache,
236
+ "use_cache should be preserved from default config (false)"
237
+ );
238
+ // enable_quality_processing should be preserved
239
+ assert!(
240
+ config.enable_quality_processing,
241
+ "enable_quality_processing should be preserved (true)"
242
+ );
243
+ }
244
+
245
+ #[test]
246
+ fn test_build_config_merges_multiple_fields() {
247
+ let default_config = ExtractionConfig {
248
+ use_cache: true,
249
+ enable_quality_processing: false,
250
+ force_ocr: true,
251
+ ..Default::default()
252
+ };
253
+
254
+ // Override multiple fields
255
+ let config_json = serde_json::json!({
256
+ "use_cache": false,
257
+ "output_format": "markdown",
258
+ });
259
+
260
+ let config = build_config(&default_config, Some(config_json)).unwrap();
261
+
262
+ // use_cache should be overridden
263
+ assert!(!config.use_cache, "use_cache should be overridden to false");
264
+ // output_format should be overridden
265
+ assert_eq!(
266
+ config.output_format,
267
+ crate::core::config::formats::OutputFormat::Markdown,
268
+ "output_format should be overridden to markdown"
269
+ );
270
+ // force_ocr should be preserved (not in override)
271
+ assert!(
272
+ config.force_ocr,
273
+ "force_ocr should be preserved from default config (true)"
274
+ );
275
+ // enable_quality_processing should be preserved
276
+ assert!(
277
+ !config.enable_quality_processing,
278
+ "enable_quality_processing should be preserved (false)"
279
+ );
280
+ }
281
+
282
+ #[test]
283
+ fn test_build_config_boolean_override_to_default_value() {
284
+ // This test validates the critical bug fix: when user explicitly sets a boolean
285
+ // to its default value, the merge logic should correctly use the override value,
286
+ // not fall back to the base config.
287
+ let base = ExtractionConfig {
288
+ use_cache: false,
289
+ ..Default::default()
290
+ };
291
+
292
+ // User explicitly provides use_cache: true (which IS the default)
293
+ let override_json = serde_json::json!({"use_cache": true});
98
294
 
99
- let config = build_config(&default_config, true, false);
295
+ let merged = build_config(&base, Some(override_json)).unwrap();
100
296
 
101
- assert!(config.ocr.is_some());
102
- let ocr_config = config.ocr.unwrap();
103
- assert_eq!(ocr_config.backend, "tesseract");
104
- assert_eq!(ocr_config.language, "eng");
297
+ // Before the fix: merged.use_cache would be false (WRONG - fell back to base)
298
+ // After the fix: merged.use_cache should be true (CORRECT - override applied)
299
+ assert!(
300
+ merged.use_cache,
301
+ "Should use explicit override even if it matches default"
302
+ );
105
303
  }
106
304
 
107
305
  #[test]
@@ -12,12 +12,9 @@ pub struct ExtractFileParams {
12
12
  /// Optional MIME type hint (auto-detected if not provided)
13
13
  #[serde(skip_serializing_if = "Option::is_none")]
14
14
  pub mime_type: Option<String>,
15
- /// Enable OCR for scanned documents
16
- #[serde(default)]
17
- pub enable_ocr: bool,
18
- /// Force OCR even if text extraction succeeds
19
- #[serde(default)]
20
- pub force_ocr: bool,
15
+ /// Extraction configuration (JSON object)
16
+ #[serde(skip_serializing_if = "Option::is_none")]
17
+ pub config: Option<serde_json::Value>,
21
18
  /// Use async extraction (default: false for sync)
22
19
  #[serde(default)]
23
20
  pub r#async: bool,
@@ -31,12 +28,9 @@ pub struct ExtractBytesParams {
31
28
  /// Optional MIME type hint (auto-detected if not provided)
32
29
  #[serde(skip_serializing_if = "Option::is_none")]
33
30
  pub mime_type: Option<String>,
34
- /// Enable OCR for scanned documents
35
- #[serde(default)]
36
- pub enable_ocr: bool,
37
- /// Force OCR even if text extraction succeeds
38
- #[serde(default)]
39
- pub force_ocr: bool,
31
+ /// Extraction configuration (JSON object)
32
+ #[serde(skip_serializing_if = "Option::is_none")]
33
+ pub config: Option<serde_json::Value>,
40
34
  /// Use async extraction (default: false for sync)
41
35
  #[serde(default)]
42
36
  pub r#async: bool,
@@ -47,12 +41,9 @@ pub struct ExtractBytesParams {
47
41
  pub struct BatchExtractFilesParams {
48
42
  /// Paths to files to extract
49
43
  pub paths: Vec<String>,
50
- /// Enable OCR for scanned documents
51
- #[serde(default)]
52
- pub enable_ocr: bool,
53
- /// Force OCR even if text extraction succeeds
54
- #[serde(default)]
55
- pub force_ocr: bool,
44
+ /// Extraction configuration (JSON object)
45
+ #[serde(skip_serializing_if = "Option::is_none")]
46
+ pub config: Option<serde_json::Value>,
56
47
  /// Use async extraction (default: false for sync)
57
48
  #[serde(default)]
58
49
  pub r#async: bool,
@@ -83,8 +74,7 @@ mod tests {
83
74
 
84
75
  assert_eq!(params.path, "/test.pdf");
85
76
  assert_eq!(params.mime_type, None);
86
- assert!(!params.enable_ocr);
87
- assert!(!params.force_ocr);
77
+ assert_eq!(params.config, None);
88
78
  assert!(!params.r#async);
89
79
  }
90
80
 
@@ -95,8 +85,7 @@ mod tests {
95
85
 
96
86
  assert_eq!(params.data, "SGVsbG8=");
97
87
  assert_eq!(params.mime_type, None);
98
- assert!(!params.enable_ocr);
99
- assert!(!params.force_ocr);
88
+ assert_eq!(params.config, None);
100
89
  assert!(!params.r#async);
101
90
  }
102
91
 
@@ -106,8 +95,7 @@ mod tests {
106
95
  let params: BatchExtractFilesParams = serde_json::from_str(json).unwrap();
107
96
 
108
97
  assert_eq!(params.paths.len(), 2);
109
- assert!(!params.enable_ocr);
110
- assert!(!params.force_ocr);
98
+ assert_eq!(params.config, None);
111
99
  assert!(!params.r#async);
112
100
  }
113
101
 
@@ -128,13 +116,21 @@ mod tests {
128
116
  assert!(!params.use_content);
129
117
  }
130
118
 
119
+ #[test]
120
+ fn test_extract_file_params_with_config() {
121
+ let json = r#"{"path": "/test.pdf", "config": {"use_cache": false}}"#;
122
+ let params: ExtractFileParams = serde_json::from_str(json).unwrap();
123
+
124
+ assert_eq!(params.path, "/test.pdf");
125
+ assert!(params.config.is_some());
126
+ }
127
+
131
128
  #[test]
132
129
  fn test_extract_file_params_serialization() {
133
130
  let params = ExtractFileParams {
134
131
  path: "/test.pdf".to_string(),
135
132
  mime_type: Some("application/pdf".to_string()),
136
- enable_ocr: true,
137
- force_ocr: false,
133
+ config: Some(serde_json::json!({"use_cache": false})),
138
134
  r#async: true,
139
135
  };
140
136
 
@@ -143,8 +139,7 @@ mod tests {
143
139
 
144
140
  assert_eq!(params.path, deserialized.path);
145
141
  assert_eq!(params.mime_type, deserialized.mime_type);
146
- assert_eq!(params.enable_ocr, deserialized.enable_ocr);
147
- assert_eq!(params.force_ocr, deserialized.force_ocr);
142
+ assert_eq!(params.config, deserialized.config);
148
143
  assert_eq!(params.r#async, deserialized.r#async);
149
144
  }
150
145
 
@@ -153,8 +148,7 @@ mod tests {
153
148
  let params = ExtractBytesParams {
154
149
  data: "SGVsbG8=".to_string(),
155
150
  mime_type: None,
156
- enable_ocr: false,
157
- force_ocr: false,
151
+ config: None,
158
152
  r#async: false,
159
153
  };
160
154
 
@@ -168,8 +162,7 @@ mod tests {
168
162
  fn test_batch_extract_params_serialization() {
169
163
  let params = BatchExtractFilesParams {
170
164
  paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
171
- enable_ocr: true,
172
- force_ocr: true,
165
+ config: Some(serde_json::json!({"use_cache": true})),
173
166
  r#async: true,
174
167
  };
175
168
 
@@ -177,7 +170,7 @@ mod tests {
177
170
  let deserialized: BatchExtractFilesParams = serde_json::from_str(&json).unwrap();
178
171
 
179
172
  assert_eq!(params.paths, deserialized.paths);
180
- assert_eq!(params.enable_ocr, deserialized.enable_ocr);
173
+ assert_eq!(params.config, deserialized.config);
181
174
  }
182
175
 
183
176
  #[test]