kreuzberg 4.2.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ //! OpenAPI 3.1 schema generation for Kreuzberg API.
2
+ //!
3
+ //! This module generates OpenAPI documentation from Rust types using utoipa.
4
+ //! The schema is available at the `/openapi.json` endpoint.
5
+
6
+ #[cfg(feature = "api")]
7
+ use utoipa::OpenApi;
8
+
9
+ /// OpenAPI documentation root for the Kreuzberg HTTP API.
10
+ ///
11
+ /// Declares the API info, servers, documented handler paths, component schemas,
12
+ /// and tags for the Kreuzberg document extraction API (`/extract` itself is not listed yet).
13
+ #[cfg(feature = "api")]
14
+ #[derive(OpenApi)]
15
+ #[openapi(
16
+ info(
17
+ title = "Kreuzberg API",
18
+ version = env!("CARGO_PKG_VERSION"),
19
+ description = "High-performance document intelligence API for extracting text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats.",
20
+ contact(
21
+ name = "Kreuzberg",
22
+ url = "https://kreuzberg.dev"
23
+ ),
24
+ license(
25
+ name = "Apache-2.0 OR MIT"
26
+ )
27
+ ),
28
+ servers(
29
+ (url = "http://localhost:8000", description = "Local development server"),
30
+ (url = "https://api.kreuzberg.dev", description = "Production server (example)")
31
+ ),
32
+ paths(
33
+ crate::api::handlers::health_handler,
34
+ crate::api::handlers::info_handler,
35
+ // Note: extract_handler is omitted until ExtractionResult implements ToSchema, so /extract is undocumented.
36
+ crate::api::handlers::cache_stats_handler,
37
+ crate::api::handlers::cache_clear_handler,
38
+ crate::api::handlers::embed_handler,
39
+ crate::api::handlers::chunk_handler,
40
+ ),
41
+ components(
42
+ schemas(
43
+ crate::api::types::HealthResponse,
44
+ crate::api::types::PluginStatus,
45
+ crate::api::types::InfoResponse,
46
+ crate::api::types::ErrorResponse,
47
+ crate::api::types::CacheStatsResponse,
48
+ crate::api::types::CacheClearResponse,
49
+ crate::api::types::EmbedRequest,
50
+ crate::api::types::EmbedResponse,
51
+ crate::api::types::ChunkRequest,
52
+ crate::api::types::ChunkResponse,
53
+ crate::api::types::ChunkItem,
54
+ crate::api::types::ChunkingConfigRequest,
55
+ crate::api::types::ChunkingConfigResponse,
56
+ )
57
+ ),
58
+ tags(
59
+ (name = "health", description = "Health and status endpoints"),
60
+ (name = "extraction", description = "Document extraction endpoints"),
61
+ (name = "cache", description = "Cache management endpoints"),
62
+ (name = "embeddings", description = "Text embedding generation"),
63
+ (name = "chunking", description = "Text chunking operations")
64
+ )
65
+ )]
66
+ pub struct ApiDoc;
67
+
/// Render the OpenAPI specification as pretty-printed JSON.
///
/// The document is produced from [`ApiDoc`]. If serialization fails, the
/// empty JSON object `"{}"` is returned instead of an error.
///
/// # Examples
///
/// ```no_run
/// use kreuzberg::api::openapi::openapi_json;
///
/// let schema = openapi_json();
/// println!("{}", schema);
/// ```
#[cfg(feature = "api")]
pub fn openapi_json() -> String {
    match ApiDoc::openapi().to_pretty_json() {
        Ok(rendered) => rendered,
        Err(_) => String::from("{}"),
    }
}

/// Stand-in used when the `api` feature is disabled.
///
/// Returns a fixed JSON object whose `error` field states that the API
/// feature is not enabled.
#[cfg(not(feature = "api"))]
pub fn openapi_json() -> String {
    String::from(r#"{"error": "API feature not enabled"}"#)
}
89
+
// Every test here needs the generated specification, so the whole module is
// gated on the `api` feature instead of gating each item separately.
#[cfg(all(test, feature = "api"))]
mod tests {
    use super::*;

    /// Paths expected under the `paths` object of the specification.
    ///
    /// `/extract` is deliberately not listed: `extract_handler` is left out of
    /// `ApiDoc`'s `paths(...)` until `ExtractionResult` implements `ToSchema`.
    /// Add it here once that handler is documented.
    const DOCUMENTED_PATHS: [&str; 6] = [
        "/health",
        "/info",
        "/cache/stats",
        "/cache/clear",
        "/embed",
        "/chunk",
    ];

    /// Schema names expected under `components.schemas`.
    const DOCUMENTED_SCHEMAS: [&str; 4] = ["HealthResponse", "ErrorResponse", "EmbedRequest", "ChunkRequest"];

    /// Parse the generated specification, failing the test on invalid JSON.
    fn parsed_schema() -> serde_json::Value {
        serde_json::from_str(&openapi_json()).expect("openapi_json() must return valid JSON")
    }

    #[test]
    fn test_openapi_schema_generation() {
        let schema = openapi_json();
        assert!(!schema.is_empty());
        assert!(schema.contains("Kreuzberg API"));
        assert!(schema.contains("/health"));
    }

    #[test]
    fn test_openapi_schema_valid_json() {
        let parsed = parsed_schema();
        assert!(parsed.is_object());
        assert!(parsed["openapi"].is_string());
    }

    #[test]
    fn test_openapi_includes_all_endpoints() {
        let parsed = parsed_schema();
        // Look at the keys of `paths` rather than searching the raw text, so a
        // path mentioned only in a description cannot satisfy the check.
        let paths = parsed["paths"].as_object().expect("`paths` must be a JSON object");
        for path in DOCUMENTED_PATHS.iter() {
            assert!(paths.contains_key(*path), "missing documented path: {}", path);
        }
    }

    #[test]
    fn test_openapi_includes_schemas() {
        let parsed = parsed_schema();
        let schemas = parsed["components"]["schemas"]
            .as_object()
            .expect("`components.schemas` must be a JSON object");
        for name in DOCUMENTED_SCHEMAS.iter() {
            assert!(schemas.contains_key(*name), "missing component schema: {}", name);
        }
    }
}
@@ -153,14 +153,22 @@ pub fn create_router_with_limits_and_server_config(
153
153
  }
154
154
  };
155
155
 
156
- Router::new()
156
+ let mut router = Router::new()
157
157
  .route("/extract", post(extract_handler))
158
158
  .route("/embed", post(embed_handler))
159
159
  .route("/chunk", post(chunk_handler))
160
160
  .route("/health", get(health_handler))
161
161
  .route("/info", get(info_handler))
162
162
  .route("/cache/stats", get(cache_stats_handler))
163
- .route("/cache/clear", delete(cache_clear_handler))
163
+ .route("/cache/clear", delete(cache_clear_handler));
164
+
165
+ // Add OpenAPI schema endpoint if API feature is enabled
166
+ #[cfg(feature = "api")]
167
+ {
168
+ router = router.route("/openapi.json", get(openapi_schema_handler));
169
+ }
170
+
171
+ router
164
172
  .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
165
173
  .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
166
174
  .layer(cors_layer)
@@ -168,6 +176,20 @@ pub fn create_router_with_limits_and_server_config(
168
176
  .with_state(state)
169
177
  }
170
178
 
/// OpenAPI schema handler.
///
/// Returns the OpenAPI 3.1 JSON schema for all documented endpoints.
///
/// The specification is converted directly into a `serde_json::Value` rather
/// than being rendered to a string and parsed again. If that conversion fails,
/// the body is `{"error": "Failed to generate OpenAPI schema"}`.
#[cfg(feature = "api")]
async fn openapi_schema_handler() -> axum::Json<serde_json::Value> {
    use crate::api::openapi::ApiDoc;
    // The trait must be in scope for `ApiDoc::openapi()` to resolve.
    use utoipa::OpenApi as _;

    let schema = serde_json::to_value(ApiDoc::openapi())
        .unwrap_or_else(|_| serde_json::json!({"error": "Failed to generate OpenAPI schema"}));

    axum::Json(schema)
}
192
+
171
193
  #[cfg(test)]
172
194
  mod tests {
173
195
  use super::*;
@@ -2,7 +2,9 @@
2
2
 
3
3
  use std::net::{IpAddr, SocketAddr};
4
4
 
5
- use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
5
+ use crate::{
6
+ ExtractionConfig, Result, core::ServerConfig, extractors, plugins::startup_validation::validate_plugins_at_startup,
7
+ };
6
8
 
7
9
  use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
8
10
 
@@ -80,7 +82,8 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
80
82
  server_config.max_multipart_field_bytes,
81
83
  );
82
84
 
83
- // Validate plugins at startup
85
+ // Initialize extractors and validate plugins at startup
86
+ extractors::ensure_initialized()?;
84
87
  validate_plugins_at_startup()?;
85
88
 
86
89
  serve_with_config_and_limits(host, port, extraction_config, limits).await
@@ -115,7 +118,8 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
115
118
  limits.max_request_body_bytes
116
119
  );
117
120
 
118
- // Validate plugins at startup
121
+ // Initialize extractors and validate plugins at startup
122
+ extractors::ensure_initialized()?;
119
123
  validate_plugins_at_startup()?;
120
124
 
121
125
  serve_with_config_and_limits(host, port, config, limits).await
@@ -165,7 +169,8 @@ pub async fn serve_with_config_and_limits(
165
169
  let addr = SocketAddr::new(ip, port);
166
170
  let app = create_router_with_limits_and_server_config(config, limits, server_config);
167
171
 
168
- // Validate plugins at startup
172
+ // Initialize extractors and validate plugins at startup
173
+ extractors::ensure_initialized()?;
169
174
  validate_plugins_at_startup()?;
170
175
 
171
176
  tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
@@ -224,7 +229,8 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
224
229
  let addr = SocketAddr::new(ip, server_config.port);
225
230
  let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
226
231
 
227
- // Validate plugins at startup
232
+ // Initialize extractors and validate plugins at startup
233
+ extractors::ensure_initialized()?;
228
234
  validate_plugins_at_startup()?;
229
235
 
230
236
  tracing::info!(
@@ -109,19 +109,41 @@ impl ApiSizeLimits {
109
109
  }
110
110
  }
111
111
 
112
+ /// Plugin registry summary carried in the optional `plugins` field of the health response.
113
+ #[derive(Debug, Clone, Serialize, Deserialize)]
114
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
115
+ pub struct PluginStatus {
116
+ /// Number of registered OCR backends.
117
+ pub ocr_backends_count: usize,
118
+ /// Names of the registered OCR backends.
119
+ pub ocr_backends: Vec<String>,
120
+ /// Number of registered document extractors.
121
+ pub extractors_count: usize,
122
+ /// Number of registered post-processors.
123
+ pub post_processors_count: usize,
124
+ }
125
+
112
126
  /// Health check response.
113
127
  #[derive(Debug, Clone, Serialize, Deserialize)]
128
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
114
129
  pub struct HealthResponse {
115
130
  /// Health status
131
+ #[cfg_attr(feature = "api", schema(example = "healthy"))]
116
132
  pub status: String,
117
133
  /// API version
134
+ #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
118
135
  pub version: String,
136
+ /// Plugin status (optional)
137
+ #[serde(skip_serializing_if = "Option::is_none")]
138
+ pub plugins: Option<PluginStatus>,
119
139
  }
120
140
 
121
141
  /// Server information response.
122
142
  #[derive(Debug, Clone, Serialize, Deserialize)]
143
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
123
144
  pub struct InfoResponse {
124
145
  /// API version
146
+ #[cfg_attr(feature = "api", schema(example = "0.8.0"))]
125
147
  pub version: String,
126
148
  /// Whether using Rust backend
127
149
  pub rust_backend: bool,
@@ -132,15 +154,19 @@ pub type ExtractResponse = Vec<ExtractionResult>;
132
154
 
133
155
  /// Error response.
134
156
  #[derive(Debug, Clone, Serialize, Deserialize)]
157
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
135
158
  pub struct ErrorResponse {
136
159
  /// Error type name
160
+ #[cfg_attr(feature = "api", schema(example = "ValidationError"))]
137
161
  pub error_type: String,
138
162
  /// Error message
163
+ #[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
139
164
  pub message: String,
140
165
  /// Stack trace (if available)
141
166
  #[serde(skip_serializing_if = "Option::is_none")]
142
167
  pub traceback: Option<String>,
143
168
  /// HTTP status code
169
+ #[cfg_attr(feature = "api", schema(example = 400))]
144
170
  pub status_code: u16,
145
171
  }
146
172
 
@@ -156,8 +182,10 @@ pub struct ApiState {
156
182
 
157
183
  /// Cache statistics response.
158
184
  #[derive(Debug, Clone, Serialize, Deserialize)]
185
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
159
186
  pub struct CacheStatsResponse {
160
187
  /// Cache directory path
188
+ #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
161
189
  pub directory: String,
162
190
  /// Total number of cache files
163
191
  pub total_files: usize,
@@ -173,8 +201,10 @@ pub struct CacheStatsResponse {
173
201
 
174
202
  /// Cache clear response.
175
203
  #[derive(Debug, Clone, Serialize, Deserialize)]
204
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
176
205
  pub struct CacheClearResponse {
177
206
  /// Cache directory path
207
+ #[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
178
208
  pub directory: String,
179
209
  /// Number of files removed
180
210
  pub removed_files: usize,
@@ -184,20 +214,25 @@ pub struct CacheClearResponse {
184
214
 
185
215
  /// Embedding request for generating embeddings from text.
186
216
  #[derive(Debug, Clone, Serialize, Deserialize)]
217
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
187
218
  pub struct EmbedRequest {
188
- /// Text strings to generate embeddings for
219
+ /// Text strings to generate embeddings for (at least one non-empty string required)
220
+ #[cfg_attr(feature = "api", schema(min_items = 1))]
189
221
  pub texts: Vec<String>,
190
222
  /// Optional embedding configuration (model, batch size, etc.)
191
223
  #[serde(skip_serializing_if = "Option::is_none")]
224
+ #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
192
225
  pub config: Option<crate::core::config::EmbeddingConfig>,
193
226
  }
194
227
 
195
228
  /// Embedding response containing generated embeddings.
196
229
  #[derive(Debug, Clone, Serialize, Deserialize)]
230
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
197
231
  pub struct EmbedResponse {
198
232
  /// Generated embeddings (one per input text)
199
233
  pub embeddings: Vec<Vec<f32>>,
200
234
  /// Model used for embedding generation
235
+ #[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
201
236
  pub model: String,
202
237
  /// Dimensionality of the embeddings
203
238
  pub dimensions: usize,
@@ -212,23 +247,29 @@ fn default_chunker_type() -> String {
212
247
 
213
248
  /// Chunk request with text and configuration.
214
249
  #[derive(Debug, Clone, Serialize, Deserialize)]
250
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
215
251
  pub struct ChunkRequest {
216
- /// Text to chunk
252
+ /// Text to chunk (must not be empty)
253
+ #[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
217
254
  pub text: String,
218
255
  /// Optional chunking configuration
219
256
  #[serde(skip_serializing_if = "Option::is_none")]
220
257
  pub config: Option<ChunkingConfigRequest>,
221
258
  /// Chunker type (text or markdown)
222
259
  #[serde(default = "default_chunker_type")]
260
+ #[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
223
261
  pub chunker_type: String,
224
262
  }
225
263
 
226
264
  /// Chunking configuration request.
227
265
  #[derive(Debug, Clone, Default, Serialize, Deserialize)]
266
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
228
267
  pub struct ChunkingConfigRequest {
229
- /// Maximum characters per chunk
268
+ /// Maximum characters per chunk (must be greater than overlap, default: 2000)
269
+ #[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
230
270
  pub max_characters: Option<usize>,
231
- /// Overlap between chunks in characters
271
+ /// Overlap between chunks in characters (must be less than max_characters, default: 100)
272
+ #[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
232
273
  pub overlap: Option<usize>,
233
274
  /// Whether to trim whitespace
234
275
  pub trim: Option<bool>,
@@ -236,6 +277,7 @@ pub struct ChunkingConfigRequest {
236
277
 
237
278
  /// Chunk response with chunks and metadata.
238
279
  #[derive(Debug, Clone, Serialize, Deserialize)]
280
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
239
281
  pub struct ChunkResponse {
240
282
  /// List of chunks
241
283
  pub chunks: Vec<ChunkItem>,
@@ -246,11 +288,13 @@ pub struct ChunkResponse {
246
288
  /// Input text size in bytes
247
289
  pub input_size_bytes: usize,
248
290
  /// Chunker type used for chunking
291
+ #[cfg_attr(feature = "api", schema(example = "text"))]
249
292
  pub chunker_type: String,
250
293
  }
251
294
 
252
295
  /// Individual chunk item with metadata.
253
296
  #[derive(Debug, Clone, Serialize, Deserialize)]
297
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
254
298
  pub struct ChunkItem {
255
299
  /// Chunk content
256
300
  pub content: String,
@@ -272,6 +316,7 @@ pub struct ChunkItem {
272
316
 
273
317
  /// Chunking configuration response.
274
318
  #[derive(Debug, Clone, Serialize, Deserialize)]
319
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
275
320
  pub struct ChunkingConfigResponse {
276
321
  /// Maximum characters per chunk
277
322
  pub max_characters: usize,
@@ -280,5 +325,6 @@ pub struct ChunkingConfigResponse {
280
325
  /// Whether whitespace was trimmed
281
326
  pub trim: bool,
282
327
  /// Type of chunker used
328
+ #[cfg_attr(feature = "api", schema(example = "text"))]
283
329
  pub chunker_type: String,
284
330
  }
@@ -84,7 +84,8 @@ pub struct ChunkingConfig {
84
84
  /// Requires the `embeddings` feature to be enabled.
85
85
  #[derive(Debug, Clone, Serialize, Deserialize)]
86
86
  pub struct EmbeddingConfig {
87
- /// The embedding model to use
87
+ /// The embedding model to use (defaults to "balanced" preset if not specified)
88
+ #[serde(default = "default_model")]
88
89
  pub model: EmbeddingModelType,
89
90
 
90
91
  /// Whether to normalize embedding vectors (recommended for cosine similarity)
@@ -156,6 +157,12 @@ fn default_batch_size() -> usize {
156
157
  32
157
158
  }
158
159
 
160
+ fn default_model() -> EmbeddingModelType {
161
+ EmbeddingModelType::Preset {
162
+ name: "balanced".to_string(),
163
+ }
164
+ }
165
+
159
166
  #[cfg(test)]
160
167
  mod tests {
161
168
  use super::*;