kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +59 -28
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/config_spec.rb +1 -1
  16. data/spec/binding/error_recovery_spec.rb +3 -3
  17. data/spec/binding/tables_spec.rb +11 -2
  18. data/spec/unit/config/extraction_config_spec.rb +2 -2
  19. data/spec/unit/config/output_format_spec.rb +18 -18
  20. data/vendor/Cargo.toml +1 -1
  21. data/vendor/kreuzberg/Cargo.toml +3 -2
  22. data/vendor/kreuzberg/README.md +1 -1
  23. data/vendor/kreuzberg/src/api/error.rs +60 -0
  24. data/vendor/kreuzberg/src/api/handlers.rs +153 -32
  25. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  26. data/vendor/kreuzberg/src/api/openapi.rs +141 -0
  27. data/vendor/kreuzberg/src/api/router.rs +24 -2
  28. data/vendor/kreuzberg/src/api/startup.rs +21 -1
  29. data/vendor/kreuzberg/src/api/types.rs +50 -4
  30. data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
  31. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  32. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  33. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  34. data/vendor/kreuzberg/src/core/io.rs +7 -7
  35. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  36. data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
  37. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  45. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  46. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  47. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  48. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
  49. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  50. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
  51. data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
  52. data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
  53. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  54. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  55. metadata +5 -2
@@ -8,13 +8,60 @@ use axum::{
8
8
  use crate::{batch_extract_bytes, cache, extract_bytes};
9
9
 
10
10
  use super::{
11
- error::ApiError,
11
+ error::{ApiError, JsonApi},
12
12
  types::{
13
13
  ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
14
14
  ExtractResponse, HealthResponse, InfoResponse,
15
15
  },
16
16
  };
17
17
 
18
+ /// Health check endpoint handler.
19
+ ///
20
+ /// GET /health
21
+ #[utoipa::path(
22
+ get,
23
+ path = "/health",
24
+ tag = "health",
25
+ responses(
26
+ (status = 200, description = "Service is healthy", body = HealthResponse),
27
+ )
28
+ )]
29
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
30
+ pub async fn health_handler() -> Json<HealthResponse> {
31
+ // Get plugin status
32
+ let plugin_status = crate::plugins::startup_validation::PluginHealthStatus::check();
33
+
34
+ Json(HealthResponse {
35
+ status: "healthy".to_string(),
36
+ version: env!("CARGO_PKG_VERSION").to_string(),
37
+ plugins: Some(super::types::PluginStatus {
38
+ ocr_backends_count: plugin_status.ocr_backends_count,
39
+ ocr_backends: plugin_status.ocr_backends,
40
+ extractors_count: plugin_status.extractors_count,
41
+ post_processors_count: plugin_status.post_processors_count,
42
+ }),
43
+ })
44
+ }
45
+
46
+ /// Server info endpoint handler.
47
+ ///
48
+ /// GET /info
49
+ #[utoipa::path(
50
+ get,
51
+ path = "/info",
52
+ tag = "health",
53
+ responses(
54
+ (status = 200, description = "Server information", body = InfoResponse),
55
+ )
56
+ )]
57
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
58
+ pub async fn info_handler() -> Json<InfoResponse> {
59
+ Json(InfoResponse {
60
+ version: env!("CARGO_PKG_VERSION").to_string(),
61
+ rust_backend: true,
62
+ })
63
+ }
64
+
18
65
  /// Extract endpoint handler.
19
66
  ///
20
67
  /// POST /extract
@@ -37,6 +84,19 @@ use super::{
37
84
  ///
38
85
  /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
39
86
  /// is used as the base, and any per-request config overrides those defaults.
87
+ // TODO: Add utoipa::path annotation once ExtractionResult implements ToSchema
88
+ // #[utoipa::path(
89
+ // post,
90
+ // path = "/extract",
91
+ // tag = "extraction",
92
+ // request_body(content_type = "multipart/form-data"),
93
+ // responses(
94
+ // (status = 200, description = "Extraction successful", body = ExtractResponse),
95
+ // (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
96
+ // (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
97
+ // (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
98
+ // )
99
+ // )]
40
100
  #[cfg_attr(
41
101
  feature = "otel",
42
102
  tracing::instrument(
@@ -132,28 +192,6 @@ pub async fn extract_handler(
132
192
  Ok(Json(results))
133
193
  }
134
194
 
135
- /// Health check endpoint handler.
136
- ///
137
- /// GET /health
138
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
139
- pub async fn health_handler() -> Json<HealthResponse> {
140
- Json(HealthResponse {
141
- status: "healthy".to_string(),
142
- version: env!("CARGO_PKG_VERSION").to_string(),
143
- })
144
- }
145
-
146
- /// Server info endpoint handler.
147
- ///
148
- /// GET /info
149
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
150
- pub async fn info_handler() -> Json<InfoResponse> {
151
- Json(InfoResponse {
152
- version: env!("CARGO_PKG_VERSION").to_string(),
153
- rust_backend: true,
154
- })
155
- }
156
-
157
195
  /// Cache stats endpoint handler.
158
196
  ///
159
197
  /// GET /cache/stats
@@ -164,6 +202,15 @@ pub async fn info_handler() -> Json<InfoResponse> {
164
202
  /// - Current directory cannot be determined
165
203
  /// - Cache directory path contains non-UTF8 characters
166
204
  /// - Cache metadata retrieval fails
205
+ #[utoipa::path(
206
+ get,
207
+ path = "/cache/stats",
208
+ tag = "cache",
209
+ responses(
210
+ (status = 200, description = "Cache statistics", body = CacheStatsResponse),
211
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
212
+ )
213
+ )]
167
214
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
168
215
  pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
169
216
  let cache_dir = std::env::current_dir()
@@ -204,6 +251,15 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
204
251
  /// - Current directory cannot be determined
205
252
  /// - Cache directory path contains non-UTF8 characters
206
253
  /// - Cache clearing operation fails
254
+ #[utoipa::path(
255
+ delete,
256
+ path = "/cache/clear",
257
+ tag = "cache",
258
+ responses(
259
+ (status = 200, description = "Cache cleared", body = CacheClearResponse),
260
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
261
+ )
262
+ )]
207
263
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
208
264
  pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
209
265
  let cache_dir = std::env::current_dir()
@@ -248,6 +304,18 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
248
304
  /// - ONNX Runtime is not available
249
305
  /// - Model initialization fails
250
306
  /// - Embedding generation fails
307
+ #[utoipa::path(
308
+ post,
309
+ path = "/embed",
310
+ tag = "embeddings",
311
+ request_body = EmbedRequest,
312
+ responses(
313
+ (status = 200, description = "Embeddings generated", body = EmbedResponse),
314
+ (status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
315
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
316
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
317
+ )
318
+ )]
251
319
  #[cfg(feature = "embeddings")]
252
320
  #[cfg_attr(
253
321
  feature = "otel",
@@ -260,7 +328,7 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
260
328
  )
261
329
  )
262
330
  )]
263
- pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
331
+ pub async fn embed_handler(JsonApi(request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
264
332
  use crate::types::{Chunk, ChunkMetadata};
265
333
 
266
334
  if request.texts.is_empty() {
@@ -269,6 +337,13 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
269
337
  )));
270
338
  }
271
339
 
340
+ // Validate that no texts are empty
341
+ if request.texts.iter().any(|t| t.is_empty()) {
342
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(
343
+ "All text entries must be non-empty strings",
344
+ )));
345
+ }
346
+
272
347
  // Use default config if none provided
273
348
  let config = request.config.unwrap_or_default();
274
349
 
@@ -331,8 +406,20 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
331
406
  /// Embedding endpoint handler (when embeddings feature is disabled).
332
407
  ///
333
408
  /// Returns an error indicating embeddings feature is not enabled.
409
+ #[utoipa::path(
410
+ post,
411
+ path = "/embed",
412
+ tag = "embeddings",
413
+ request_body = EmbedRequest,
414
+ responses(
415
+ (status = 200, description = "Embeddings generated", body = EmbedResponse),
416
+ (status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
417
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
418
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
419
+ )
420
+ )]
334
421
  #[cfg(not(feature = "embeddings"))]
335
- pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
422
+ pub async fn embed_handler(JsonApi(_request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
336
423
  Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
337
424
  "Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
338
425
  )))
@@ -344,6 +431,18 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
344
431
  ///
345
432
  /// Accepts JSON body with text and optional configuration.
346
433
  /// Returns chunks with metadata.
434
+ #[utoipa::path(
435
+ post,
436
+ path = "/chunk",
437
+ tag = "chunking",
438
+ request_body = ChunkRequest,
439
+ responses(
440
+ (status = 200, description = "Text chunked successfully", body = ChunkResponse),
441
+ (status = 400, description = "Bad request - validation failed (e.g., empty text)", body = crate::api::types::ErrorResponse),
442
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
443
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
444
+ )
445
+ )]
347
446
  #[cfg_attr(
348
447
  feature = "otel",
349
448
  tracing::instrument(
@@ -352,7 +451,7 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
352
451
  fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
353
452
  )
354
453
  )]
355
- pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
454
+ pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
356
455
  use super::types::{ChunkItem, ChunkingConfigResponse};
357
456
  use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
358
457
 
@@ -363,9 +462,9 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
363
462
  )));
364
463
  }
365
464
 
366
- // Parse chunker_type
465
+ // Parse chunker_type (empty string is invalid, use default by omitting the field)
367
466
  let chunker_type = match request.chunker_type.to_lowercase().as_str() {
368
- "text" | "" => ChunkerType::Text,
467
+ "text" => ChunkerType::Text,
369
468
  "markdown" => ChunkerType::Markdown,
370
469
  other => {
371
470
  return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
@@ -377,15 +476,37 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
377
476
 
378
477
  // Build config with defaults
379
478
  let cfg = request.config.unwrap_or_default();
479
+ let max_characters = cfg.max_characters.unwrap_or(2000);
480
+ let overlap = cfg.overlap.unwrap_or(100);
481
+
482
+ // Validate chunking configuration
483
+ if overlap >= max_characters {
484
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
485
+ "Invalid chunking configuration: overlap ({}) must be less than max_characters ({})",
486
+ overlap, max_characters
487
+ ))));
488
+ }
489
+
380
490
  let config = ChunkingConfig {
381
- max_characters: cfg.max_characters.unwrap_or(2000),
382
- overlap: cfg.overlap.unwrap_or(100),
491
+ max_characters,
492
+ overlap,
383
493
  trim: cfg.trim.unwrap_or(true),
384
494
  chunker_type,
385
495
  };
386
496
 
387
- // Perform chunking
388
- let result = chunk_text(&request.text, &config, None).map_err(ApiError::internal)?;
497
+ // Perform chunking - convert any remaining errors to validation errors since they're likely config issues
498
+ let result = chunk_text(&request.text, &config, None).map_err(|e| {
499
+ // Check if error message indicates a configuration issue
500
+ let msg = e.to_string();
501
+ if msg.contains("configuration") || msg.contains("overlap") || msg.contains("capacity") {
502
+ ApiError::validation(crate::error::KreuzbergError::validation(format!(
503
+ "Invalid chunking configuration: {}",
504
+ msg
505
+ )))
506
+ } else {
507
+ ApiError::internal(e)
508
+ }
509
+ })?;
389
510
 
390
511
  // Transform to response
391
512
  let chunks = result
@@ -87,6 +87,8 @@
87
87
  mod config;
88
88
  mod error;
89
89
  mod handlers;
90
+ #[cfg(feature = "api")]
91
+ pub mod openapi;
90
92
  mod router;
91
93
  mod startup;
92
94
  mod types;
@@ -0,0 +1,141 @@
1
+ //! OpenAPI 3.1 schema generation for Kreuzberg API.
2
+ //!
3
+ //! This module generates OpenAPI documentation from Rust types using utoipa.
4
+ //! The schema is available at the `/openapi.json` endpoint.
5
+
6
+ #[cfg(feature = "api")]
7
+ use utoipa::OpenApi;
8
+
9
+ /// OpenAPI documentation structure.
10
+ ///
11
+ /// Defines all endpoints, request/response schemas, and examples
12
+ /// for the Kreuzberg document extraction API.
13
+ #[cfg(feature = "api")]
14
+ #[derive(OpenApi)]
15
+ #[openapi(
16
+ info(
17
+ title = "Kreuzberg API",
18
+ version = env!("CARGO_PKG_VERSION"),
19
+ description = "High-performance document intelligence API for extracting text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats.",
20
+ contact(
21
+ name = "Kreuzberg",
22
+ url = "https://kreuzberg.dev"
23
+ ),
24
+ license(
25
+ name = "Apache-2.0 OR MIT"
26
+ )
27
+ ),
28
+ servers(
29
+ (url = "http://localhost:8000", description = "Local development server"),
30
+ (url = "https://api.kreuzberg.dev", description = "Production server (example)")
31
+ ),
32
+ paths(
33
+ crate::api::handlers::health_handler,
34
+ crate::api::handlers::info_handler,
35
+ // Note: extract_handler omitted - requires ExtractionResult ToSchema impl
36
+ crate::api::handlers::cache_stats_handler,
37
+ crate::api::handlers::cache_clear_handler,
38
+ crate::api::handlers::embed_handler,
39
+ crate::api::handlers::chunk_handler,
40
+ ),
41
+ components(
42
+ schemas(
43
+ crate::api::types::HealthResponse,
44
+ crate::api::types::PluginStatus,
45
+ crate::api::types::InfoResponse,
46
+ crate::api::types::ErrorResponse,
47
+ crate::api::types::CacheStatsResponse,
48
+ crate::api::types::CacheClearResponse,
49
+ crate::api::types::EmbedRequest,
50
+ crate::api::types::EmbedResponse,
51
+ crate::api::types::ChunkRequest,
52
+ crate::api::types::ChunkResponse,
53
+ crate::api::types::ChunkItem,
54
+ crate::api::types::ChunkingConfigRequest,
55
+ crate::api::types::ChunkingConfigResponse,
56
+ )
57
+ ),
58
+ tags(
59
+ (name = "health", description = "Health and status endpoints"),
60
+ (name = "extraction", description = "Document extraction endpoints"),
61
+ (name = "cache", description = "Cache management endpoints"),
62
+ (name = "embeddings", description = "Text embedding generation"),
63
+ (name = "chunking", description = "Text chunking operations")
64
+ )
65
+ )]
66
+ pub struct ApiDoc;
67
+
68
+ /// Generate OpenAPI JSON schema.
69
+ ///
70
+ /// Returns the complete OpenAPI 3.1 specification as a JSON string.
71
+ ///
72
+ /// # Examples
73
+ ///
74
+ /// ```no_run
75
+ /// use kreuzberg::api::openapi::openapi_json;
76
+ ///
77
+ /// let schema = openapi_json();
78
+ /// println!("{}", schema);
79
+ /// ```
80
+ #[cfg(feature = "api")]
81
+ pub fn openapi_json() -> String {
82
+ ApiDoc::openapi().to_pretty_json().unwrap_or_else(|_| "{}".to_string())
83
+ }
84
+
85
+ #[cfg(not(feature = "api"))]
86
+ pub fn openapi_json() -> String {
87
+ r#"{"error": "API feature not enabled"}"#.to_string()
88
+ }
89
+
90
+ #[cfg(test)]
91
+ mod tests {
92
+ #[cfg(feature = "api")]
93
+ use super::*;
94
+
95
+ #[test]
96
+ #[cfg(feature = "api")]
97
+ fn test_openapi_schema_generation() {
98
+ let schema = openapi_json();
99
+ assert!(!schema.is_empty());
100
+ assert!(schema.contains("Kreuzberg API"));
101
+ assert!(schema.contains("/health"));
102
+ assert!(schema.contains("/extract"));
103
+ }
104
+
105
+ #[test]
106
+ #[cfg(feature = "api")]
107
+ fn test_openapi_schema_valid_json() {
108
+ let schema = openapi_json();
109
+ let parsed: serde_json::Value = serde_json::from_str(&schema).expect("Invalid JSON");
110
+ assert!(parsed.is_object());
111
+ assert!(parsed["openapi"].is_string());
112
+ }
113
+
114
+ #[test]
115
+ #[cfg(feature = "api")]
116
+ fn test_openapi_includes_all_endpoints() {
117
+ let schema = openapi_json();
118
+ // Health endpoints
119
+ assert!(schema.contains("/health"));
120
+ assert!(schema.contains("/info"));
121
+ // Extraction
122
+ assert!(schema.contains("/extract"));
123
+ // Cache
124
+ assert!(schema.contains("/cache/stats"));
125
+ assert!(schema.contains("/cache/clear"));
126
+ // Embeddings
127
+ assert!(schema.contains("/embed"));
128
+ // Chunking
129
+ assert!(schema.contains("/chunk"));
130
+ }
131
+
132
+ #[test]
133
+ #[cfg(feature = "api")]
134
+ fn test_openapi_includes_schemas() {
135
+ let schema = openapi_json();
136
+ assert!(schema.contains("HealthResponse"));
137
+ assert!(schema.contains("ErrorResponse"));
138
+ assert!(schema.contains("EmbedRequest"));
139
+ assert!(schema.contains("ChunkRequest"));
140
+ }
141
+ }
@@ -153,14 +153,22 @@ pub fn create_router_with_limits_and_server_config(
153
153
  }
154
154
  };
155
155
 
156
- Router::new()
156
+ let mut router = Router::new()
157
157
  .route("/extract", post(extract_handler))
158
158
  .route("/embed", post(embed_handler))
159
159
  .route("/chunk", post(chunk_handler))
160
160
  .route("/health", get(health_handler))
161
161
  .route("/info", get(info_handler))
162
162
  .route("/cache/stats", get(cache_stats_handler))
163
- .route("/cache/clear", delete(cache_clear_handler))
163
+ .route("/cache/clear", delete(cache_clear_handler));
164
+
165
+ // Add OpenAPI schema endpoint if API feature is enabled
166
+ #[cfg(feature = "api")]
167
+ {
168
+ router = router.route("/openapi.json", get(openapi_schema_handler));
169
+ }
170
+
171
+ router
164
172
  .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
165
173
  .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
166
174
  .layer(cors_layer)
@@ -168,6 +176,20 @@ pub fn create_router_with_limits_and_server_config(
168
176
  .with_state(state)
169
177
  }
170
178
 
179
+ /// OpenAPI schema handler.
180
+ ///
181
+ /// Returns the OpenAPI 3.1 JSON schema for all documented endpoints.
182
+ #[cfg(feature = "api")]
183
+ async fn openapi_schema_handler() -> axum::Json<serde_json::Value> {
184
+ use crate::api::openapi::openapi_json;
185
+
186
+ let schema_str = openapi_json();
187
+ let schema: serde_json::Value = serde_json::from_str(&schema_str)
188
+ .unwrap_or_else(|_| serde_json::json!({"error": "Failed to generate OpenAPI schema"}));
189
+
190
+ axum::Json(schema)
191
+ }
192
+
171
193
  #[cfg(test)]
172
194
  mod tests {
173
195
  use super::*;
@@ -2,7 +2,9 @@
2
2
 
3
3
  use std::net::{IpAddr, SocketAddr};
4
4
 
5
- use crate::{ExtractionConfig, Result, core::ServerConfig};
5
+ use crate::{
6
+ ExtractionConfig, Result, core::ServerConfig, extractors, plugins::startup_validation::validate_plugins_at_startup,
7
+ };
6
8
 
7
9
  use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
8
10
 
@@ -80,6 +82,10 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
80
82
  server_config.max_multipart_field_bytes,
81
83
  );
82
84
 
85
+ // Initialize extractors and validate plugins at startup
86
+ extractors::ensure_initialized()?;
87
+ validate_plugins_at_startup()?;
88
+
83
89
  serve_with_config_and_limits(host, port, extraction_config, limits).await
84
90
  }
85
91
 
@@ -111,6 +117,11 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
111
117
  "Upload size limit: 100 MB (default, {} bytes)",
112
118
  limits.max_request_body_bytes
113
119
  );
120
+
121
+ // Initialize extractors and validate plugins at startup
122
+ extractors::ensure_initialized()?;
123
+ validate_plugins_at_startup()?;
124
+
114
125
  serve_with_config_and_limits(host, port, config, limits).await
115
126
  }
116
127
 
@@ -158,6 +169,10 @@ pub async fn serve_with_config_and_limits(
158
169
  let addr = SocketAddr::new(ip, port);
159
170
  let app = create_router_with_limits_and_server_config(config, limits, server_config);
160
171
 
172
+ // Initialize extractors and validate plugins at startup
173
+ extractors::ensure_initialized()?;
174
+ validate_plugins_at_startup()?;
175
+
161
176
  tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
162
177
 
163
178
  let listener = tokio::net::TcpListener::bind(addr)
@@ -214,6 +229,10 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
214
229
  let addr = SocketAddr::new(ip, server_config.port);
215
230
  let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
216
231
 
232
+ // Initialize extractors and validate plugins at startup
233
+ extractors::ensure_initialized()?;
234
+ validate_plugins_at_startup()?;
235
+
217
236
  tracing::info!(
218
237
  "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
219
238
  ip,
@@ -238,6 +257,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
238
257
  /// Defaults: host = "127.0.0.1", port = 8000
239
258
  ///
240
259
  /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
260
+ /// Validates plugins at startup to help diagnose configuration issues.
241
261
  pub async fn serve_default() -> Result<()> {
242
262
  serve("127.0.0.1", 8000).await
243
263
  }