kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
|
@@ -8,13 +8,60 @@ use axum::{
|
|
|
8
8
|
use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
9
9
|
|
|
10
10
|
use super::{
|
|
11
|
-
error::ApiError,
|
|
11
|
+
error::{ApiError, JsonApi},
|
|
12
12
|
types::{
|
|
13
13
|
ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
|
|
14
14
|
ExtractResponse, HealthResponse, InfoResponse,
|
|
15
15
|
},
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
+
/// Health check endpoint handler.
|
|
19
|
+
///
|
|
20
|
+
/// GET /health
|
|
21
|
+
#[utoipa::path(
|
|
22
|
+
get,
|
|
23
|
+
path = "/health",
|
|
24
|
+
tag = "health",
|
|
25
|
+
responses(
|
|
26
|
+
(status = 200, description = "Service is healthy", body = HealthResponse),
|
|
27
|
+
)
|
|
28
|
+
)]
|
|
29
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
30
|
+
pub async fn health_handler() -> Json<HealthResponse> {
|
|
31
|
+
// Get plugin status
|
|
32
|
+
let plugin_status = crate::plugins::startup_validation::PluginHealthStatus::check();
|
|
33
|
+
|
|
34
|
+
Json(HealthResponse {
|
|
35
|
+
status: "healthy".to_string(),
|
|
36
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
37
|
+
plugins: Some(super::types::PluginStatus {
|
|
38
|
+
ocr_backends_count: plugin_status.ocr_backends_count,
|
|
39
|
+
ocr_backends: plugin_status.ocr_backends,
|
|
40
|
+
extractors_count: plugin_status.extractors_count,
|
|
41
|
+
post_processors_count: plugin_status.post_processors_count,
|
|
42
|
+
}),
|
|
43
|
+
})
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/// Server info endpoint handler.
|
|
47
|
+
///
|
|
48
|
+
/// GET /info
|
|
49
|
+
#[utoipa::path(
|
|
50
|
+
get,
|
|
51
|
+
path = "/info",
|
|
52
|
+
tag = "health",
|
|
53
|
+
responses(
|
|
54
|
+
(status = 200, description = "Server information", body = InfoResponse),
|
|
55
|
+
)
|
|
56
|
+
)]
|
|
57
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
58
|
+
pub async fn info_handler() -> Json<InfoResponse> {
|
|
59
|
+
Json(InfoResponse {
|
|
60
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
61
|
+
rust_backend: true,
|
|
62
|
+
})
|
|
63
|
+
}
|
|
64
|
+
|
|
18
65
|
/// Extract endpoint handler.
|
|
19
66
|
///
|
|
20
67
|
/// POST /extract
|
|
@@ -37,6 +84,19 @@ use super::{
|
|
|
37
84
|
///
|
|
38
85
|
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
39
86
|
/// is used as the base, and any per-request config overrides those defaults.
|
|
87
|
+
// TODO: Add utoipa::path annotation once ExtractionResult implements ToSchema
|
|
88
|
+
// #[utoipa::path(
|
|
89
|
+
// post,
|
|
90
|
+
// path = "/extract",
|
|
91
|
+
// tag = "extraction",
|
|
92
|
+
// request_body(content_type = "multipart/form-data"),
|
|
93
|
+
// responses(
|
|
94
|
+
// (status = 200, description = "Extraction successful", body = ExtractResponse),
|
|
95
|
+
// (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
|
|
96
|
+
// (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
|
|
97
|
+
// (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
98
|
+
// )
|
|
99
|
+
// )]
|
|
40
100
|
#[cfg_attr(
|
|
41
101
|
feature = "otel",
|
|
42
102
|
tracing::instrument(
|
|
@@ -132,28 +192,6 @@ pub async fn extract_handler(
|
|
|
132
192
|
Ok(Json(results))
|
|
133
193
|
}
|
|
134
194
|
|
|
135
|
-
/// Health check endpoint handler.
|
|
136
|
-
///
|
|
137
|
-
/// GET /health
|
|
138
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
139
|
-
pub async fn health_handler() -> Json<HealthResponse> {
|
|
140
|
-
Json(HealthResponse {
|
|
141
|
-
status: "healthy".to_string(),
|
|
142
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
143
|
-
})
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/// Server info endpoint handler.
|
|
147
|
-
///
|
|
148
|
-
/// GET /info
|
|
149
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
150
|
-
pub async fn info_handler() -> Json<InfoResponse> {
|
|
151
|
-
Json(InfoResponse {
|
|
152
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
153
|
-
rust_backend: true,
|
|
154
|
-
})
|
|
155
|
-
}
|
|
156
|
-
|
|
157
195
|
/// Cache stats endpoint handler.
|
|
158
196
|
///
|
|
159
197
|
/// GET /cache/stats
|
|
@@ -164,6 +202,15 @@ pub async fn info_handler() -> Json<InfoResponse> {
|
|
|
164
202
|
/// - Current directory cannot be determined
|
|
165
203
|
/// - Cache directory path contains non-UTF8 characters
|
|
166
204
|
/// - Cache metadata retrieval fails
|
|
205
|
+
#[utoipa::path(
|
|
206
|
+
get,
|
|
207
|
+
path = "/cache/stats",
|
|
208
|
+
tag = "cache",
|
|
209
|
+
responses(
|
|
210
|
+
(status = 200, description = "Cache statistics", body = CacheStatsResponse),
|
|
211
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
212
|
+
)
|
|
213
|
+
)]
|
|
167
214
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
|
|
168
215
|
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
169
216
|
let cache_dir = std::env::current_dir()
|
|
@@ -204,6 +251,15 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
|
|
|
204
251
|
/// - Current directory cannot be determined
|
|
205
252
|
/// - Cache directory path contains non-UTF8 characters
|
|
206
253
|
/// - Cache clearing operation fails
|
|
254
|
+
#[utoipa::path(
|
|
255
|
+
delete,
|
|
256
|
+
path = "/cache/clear",
|
|
257
|
+
tag = "cache",
|
|
258
|
+
responses(
|
|
259
|
+
(status = 200, description = "Cache cleared", body = CacheClearResponse),
|
|
260
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
261
|
+
)
|
|
262
|
+
)]
|
|
207
263
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
|
|
208
264
|
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
209
265
|
let cache_dir = std::env::current_dir()
|
|
@@ -248,6 +304,18 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
248
304
|
/// - ONNX Runtime is not available
|
|
249
305
|
/// - Model initialization fails
|
|
250
306
|
/// - Embedding generation fails
|
|
307
|
+
#[utoipa::path(
|
|
308
|
+
post,
|
|
309
|
+
path = "/embed",
|
|
310
|
+
tag = "embeddings",
|
|
311
|
+
request_body = EmbedRequest,
|
|
312
|
+
responses(
|
|
313
|
+
(status = 200, description = "Embeddings generated", body = EmbedResponse),
|
|
314
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
|
|
315
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
316
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
317
|
+
)
|
|
318
|
+
)]
|
|
251
319
|
#[cfg(feature = "embeddings")]
|
|
252
320
|
#[cfg_attr(
|
|
253
321
|
feature = "otel",
|
|
@@ -260,7 +328,7 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
260
328
|
)
|
|
261
329
|
)
|
|
262
330
|
)]
|
|
263
|
-
pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
331
|
+
pub async fn embed_handler(JsonApi(request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
264
332
|
use crate::types::{Chunk, ChunkMetadata};
|
|
265
333
|
|
|
266
334
|
if request.texts.is_empty() {
|
|
@@ -269,6 +337,13 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
|
|
|
269
337
|
)));
|
|
270
338
|
}
|
|
271
339
|
|
|
340
|
+
// Validate that no texts are empty
|
|
341
|
+
if request.texts.iter().any(|t| t.is_empty()) {
|
|
342
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
343
|
+
"All text entries must be non-empty strings",
|
|
344
|
+
)));
|
|
345
|
+
}
|
|
346
|
+
|
|
272
347
|
// Use default config if none provided
|
|
273
348
|
let config = request.config.unwrap_or_default();
|
|
274
349
|
|
|
@@ -331,8 +406,20 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
|
|
|
331
406
|
/// Embedding endpoint handler (when embeddings feature is disabled).
|
|
332
407
|
///
|
|
333
408
|
/// Returns an error indicating embeddings feature is not enabled.
|
|
409
|
+
#[utoipa::path(
|
|
410
|
+
post,
|
|
411
|
+
path = "/embed",
|
|
412
|
+
tag = "embeddings",
|
|
413
|
+
request_body = EmbedRequest,
|
|
414
|
+
responses(
|
|
415
|
+
(status = 200, description = "Embeddings generated", body = EmbedResponse),
|
|
416
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
|
|
417
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
418
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
419
|
+
)
|
|
420
|
+
)]
|
|
334
421
|
#[cfg(not(feature = "embeddings"))]
|
|
335
|
-
pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
422
|
+
pub async fn embed_handler(JsonApi(_request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
336
423
|
Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
|
|
337
424
|
"Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
|
|
338
425
|
)))
|
|
@@ -344,6 +431,18 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
|
|
|
344
431
|
///
|
|
345
432
|
/// Accepts JSON body with text and optional configuration.
|
|
346
433
|
/// Returns chunks with metadata.
|
|
434
|
+
#[utoipa::path(
|
|
435
|
+
post,
|
|
436
|
+
path = "/chunk",
|
|
437
|
+
tag = "chunking",
|
|
438
|
+
request_body = ChunkRequest,
|
|
439
|
+
responses(
|
|
440
|
+
(status = 200, description = "Text chunked successfully", body = ChunkResponse),
|
|
441
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty text)", body = crate::api::types::ErrorResponse),
|
|
442
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
443
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
444
|
+
)
|
|
445
|
+
)]
|
|
347
446
|
#[cfg_attr(
|
|
348
447
|
feature = "otel",
|
|
349
448
|
tracing::instrument(
|
|
@@ -352,7 +451,7 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
|
|
|
352
451
|
fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
|
|
353
452
|
)
|
|
354
453
|
)]
|
|
355
|
-
pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
|
|
454
|
+
pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
|
|
356
455
|
use super::types::{ChunkItem, ChunkingConfigResponse};
|
|
357
456
|
use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
|
|
358
457
|
|
|
@@ -363,9 +462,9 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
|
|
|
363
462
|
)));
|
|
364
463
|
}
|
|
365
464
|
|
|
366
|
-
// Parse chunker_type
|
|
465
|
+
// Parse chunker_type (empty string is invalid, use default by omitting the field)
|
|
367
466
|
let chunker_type = match request.chunker_type.to_lowercase().as_str() {
|
|
368
|
-
"text"
|
|
467
|
+
"text" => ChunkerType::Text,
|
|
369
468
|
"markdown" => ChunkerType::Markdown,
|
|
370
469
|
other => {
|
|
371
470
|
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
@@ -377,15 +476,37 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
|
|
|
377
476
|
|
|
378
477
|
// Build config with defaults
|
|
379
478
|
let cfg = request.config.unwrap_or_default();
|
|
479
|
+
let max_characters = cfg.max_characters.unwrap_or(2000);
|
|
480
|
+
let overlap = cfg.overlap.unwrap_or(100);
|
|
481
|
+
|
|
482
|
+
// Validate chunking configuration
|
|
483
|
+
if overlap >= max_characters {
|
|
484
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
485
|
+
"Invalid chunking configuration: overlap ({}) must be less than max_characters ({})",
|
|
486
|
+
overlap, max_characters
|
|
487
|
+
))));
|
|
488
|
+
}
|
|
489
|
+
|
|
380
490
|
let config = ChunkingConfig {
|
|
381
|
-
max_characters: cfg.max_characters.unwrap_or(2000),
|
|
382
|
-
overlap: cfg.overlap.unwrap_or(100),
|
|
491
|
+
max_characters,
|
|
492
|
+
overlap,
|
|
383
493
|
trim: cfg.trim.unwrap_or(true),
|
|
384
494
|
chunker_type,
|
|
385
495
|
};
|
|
386
496
|
|
|
387
|
-
// Perform chunking
|
|
388
|
-
let result = chunk_text(&request.text, &config, None).map_err(
|
|
497
|
+
// Perform chunking - convert any remaining errors to validation errors since they're likely config issues
|
|
498
|
+
let result = chunk_text(&request.text, &config, None).map_err(|e| {
|
|
499
|
+
// Check if error message indicates a configuration issue
|
|
500
|
+
let msg = e.to_string();
|
|
501
|
+
if msg.contains("configuration") || msg.contains("overlap") || msg.contains("capacity") {
|
|
502
|
+
ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
503
|
+
"Invalid chunking configuration: {}",
|
|
504
|
+
msg
|
|
505
|
+
)))
|
|
506
|
+
} else {
|
|
507
|
+
ApiError::internal(e)
|
|
508
|
+
}
|
|
509
|
+
})?;
|
|
389
510
|
|
|
390
511
|
// Transform to response
|
|
391
512
|
let chunks = result
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
//! OpenAPI 3.1 schema generation for Kreuzberg API.
|
|
2
|
+
//!
|
|
3
|
+
//! This module generates OpenAPI documentation from Rust types using utoipa.
|
|
4
|
+
//! The schema is available at the `/openapi.json` endpoint.
|
|
5
|
+
|
|
6
|
+
#[cfg(feature = "api")]
|
|
7
|
+
use utoipa::OpenApi;
|
|
8
|
+
|
|
9
|
+
/// OpenAPI documentation structure.
|
|
10
|
+
///
|
|
11
|
+
/// Defines all endpoints, request/response schemas, and examples
|
|
12
|
+
/// for the Kreuzberg document extraction API.
|
|
13
|
+
#[cfg(feature = "api")]
|
|
14
|
+
#[derive(OpenApi)]
|
|
15
|
+
#[openapi(
|
|
16
|
+
info(
|
|
17
|
+
title = "Kreuzberg API",
|
|
18
|
+
version = env!("CARGO_PKG_VERSION"),
|
|
19
|
+
description = "High-performance document intelligence API for extracting text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats.",
|
|
20
|
+
contact(
|
|
21
|
+
name = "Kreuzberg",
|
|
22
|
+
url = "https://kreuzberg.dev"
|
|
23
|
+
),
|
|
24
|
+
license(
|
|
25
|
+
name = "Apache-2.0 OR MIT"
|
|
26
|
+
)
|
|
27
|
+
),
|
|
28
|
+
servers(
|
|
29
|
+
(url = "http://localhost:8000", description = "Local development server"),
|
|
30
|
+
(url = "https://api.kreuzberg.dev", description = "Production server (example)")
|
|
31
|
+
),
|
|
32
|
+
paths(
|
|
33
|
+
crate::api::handlers::health_handler,
|
|
34
|
+
crate::api::handlers::info_handler,
|
|
35
|
+
// Note: extract_handler omitted - requires ExtractionResult ToSchema impl
|
|
36
|
+
crate::api::handlers::cache_stats_handler,
|
|
37
|
+
crate::api::handlers::cache_clear_handler,
|
|
38
|
+
crate::api::handlers::embed_handler,
|
|
39
|
+
crate::api::handlers::chunk_handler,
|
|
40
|
+
),
|
|
41
|
+
components(
|
|
42
|
+
schemas(
|
|
43
|
+
crate::api::types::HealthResponse,
|
|
44
|
+
crate::api::types::PluginStatus,
|
|
45
|
+
crate::api::types::InfoResponse,
|
|
46
|
+
crate::api::types::ErrorResponse,
|
|
47
|
+
crate::api::types::CacheStatsResponse,
|
|
48
|
+
crate::api::types::CacheClearResponse,
|
|
49
|
+
crate::api::types::EmbedRequest,
|
|
50
|
+
crate::api::types::EmbedResponse,
|
|
51
|
+
crate::api::types::ChunkRequest,
|
|
52
|
+
crate::api::types::ChunkResponse,
|
|
53
|
+
crate::api::types::ChunkItem,
|
|
54
|
+
crate::api::types::ChunkingConfigRequest,
|
|
55
|
+
crate::api::types::ChunkingConfigResponse,
|
|
56
|
+
)
|
|
57
|
+
),
|
|
58
|
+
tags(
|
|
59
|
+
(name = "health", description = "Health and status endpoints"),
|
|
60
|
+
(name = "extraction", description = "Document extraction endpoints"),
|
|
61
|
+
(name = "cache", description = "Cache management endpoints"),
|
|
62
|
+
(name = "embeddings", description = "Text embedding generation"),
|
|
63
|
+
(name = "chunking", description = "Text chunking operations")
|
|
64
|
+
)
|
|
65
|
+
)]
|
|
66
|
+
pub struct ApiDoc;
|
|
67
|
+
|
|
68
|
+
/// Generate OpenAPI JSON schema.
|
|
69
|
+
///
|
|
70
|
+
/// Returns the complete OpenAPI 3.1 specification as a JSON string.
|
|
71
|
+
///
|
|
72
|
+
/// # Examples
|
|
73
|
+
///
|
|
74
|
+
/// ```no_run
|
|
75
|
+
/// use kreuzberg::api::openapi::openapi_json;
|
|
76
|
+
///
|
|
77
|
+
/// let schema = openapi_json();
|
|
78
|
+
/// println!("{}", schema);
|
|
79
|
+
/// ```
|
|
80
|
+
#[cfg(feature = "api")]
|
|
81
|
+
pub fn openapi_json() -> String {
|
|
82
|
+
ApiDoc::openapi().to_pretty_json().unwrap_or_else(|_| "{}".to_string())
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
#[cfg(not(feature = "api"))]
|
|
86
|
+
pub fn openapi_json() -> String {
|
|
87
|
+
r#"{"error": "API feature not enabled"}"#.to_string()
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[cfg(test)]
|
|
91
|
+
mod tests {
|
|
92
|
+
#[cfg(feature = "api")]
|
|
93
|
+
use super::*;
|
|
94
|
+
|
|
95
|
+
#[test]
|
|
96
|
+
#[cfg(feature = "api")]
|
|
97
|
+
fn test_openapi_schema_generation() {
|
|
98
|
+
let schema = openapi_json();
|
|
99
|
+
assert!(!schema.is_empty());
|
|
100
|
+
assert!(schema.contains("Kreuzberg API"));
|
|
101
|
+
assert!(schema.contains("/health"));
|
|
102
|
+
assert!(schema.contains("/extract"));
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
#[test]
|
|
106
|
+
#[cfg(feature = "api")]
|
|
107
|
+
fn test_openapi_schema_valid_json() {
|
|
108
|
+
let schema = openapi_json();
|
|
109
|
+
let parsed: serde_json::Value = serde_json::from_str(&schema).expect("Invalid JSON");
|
|
110
|
+
assert!(parsed.is_object());
|
|
111
|
+
assert!(parsed["openapi"].is_string());
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
#[test]
|
|
115
|
+
#[cfg(feature = "api")]
|
|
116
|
+
fn test_openapi_includes_all_endpoints() {
|
|
117
|
+
let schema = openapi_json();
|
|
118
|
+
// Health endpoints
|
|
119
|
+
assert!(schema.contains("/health"));
|
|
120
|
+
assert!(schema.contains("/info"));
|
|
121
|
+
// Extraction
|
|
122
|
+
assert!(schema.contains("/extract"));
|
|
123
|
+
// Cache
|
|
124
|
+
assert!(schema.contains("/cache/stats"));
|
|
125
|
+
assert!(schema.contains("/cache/clear"));
|
|
126
|
+
// Embeddings
|
|
127
|
+
assert!(schema.contains("/embed"));
|
|
128
|
+
// Chunking
|
|
129
|
+
assert!(schema.contains("/chunk"));
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
#[test]
|
|
133
|
+
#[cfg(feature = "api")]
|
|
134
|
+
fn test_openapi_includes_schemas() {
|
|
135
|
+
let schema = openapi_json();
|
|
136
|
+
assert!(schema.contains("HealthResponse"));
|
|
137
|
+
assert!(schema.contains("ErrorResponse"));
|
|
138
|
+
assert!(schema.contains("EmbedRequest"));
|
|
139
|
+
assert!(schema.contains("ChunkRequest"));
|
|
140
|
+
}
|
|
141
|
+
}
|
|
@@ -153,14 +153,22 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
153
153
|
}
|
|
154
154
|
};
|
|
155
155
|
|
|
156
|
-
Router::new()
|
|
156
|
+
let mut router = Router::new()
|
|
157
157
|
.route("/extract", post(extract_handler))
|
|
158
158
|
.route("/embed", post(embed_handler))
|
|
159
159
|
.route("/chunk", post(chunk_handler))
|
|
160
160
|
.route("/health", get(health_handler))
|
|
161
161
|
.route("/info", get(info_handler))
|
|
162
162
|
.route("/cache/stats", get(cache_stats_handler))
|
|
163
|
-
.route("/cache/clear", delete(cache_clear_handler))
|
|
163
|
+
.route("/cache/clear", delete(cache_clear_handler));
|
|
164
|
+
|
|
165
|
+
// Add OpenAPI schema endpoint if API feature is enabled
|
|
166
|
+
#[cfg(feature = "api")]
|
|
167
|
+
{
|
|
168
|
+
router = router.route("/openapi.json", get(openapi_schema_handler));
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
router
|
|
164
172
|
.layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
|
|
165
173
|
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
166
174
|
.layer(cors_layer)
|
|
@@ -168,6 +176,20 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
168
176
|
.with_state(state)
|
|
169
177
|
}
|
|
170
178
|
|
|
179
|
+
/// OpenAPI schema handler.
|
|
180
|
+
///
|
|
181
|
+
/// Returns the OpenAPI 3.1 JSON schema for all documented endpoints.
|
|
182
|
+
#[cfg(feature = "api")]
|
|
183
|
+
async fn openapi_schema_handler() -> axum::Json<serde_json::Value> {
|
|
184
|
+
use crate::api::openapi::openapi_json;
|
|
185
|
+
|
|
186
|
+
let schema_str = openapi_json();
|
|
187
|
+
let schema: serde_json::Value = serde_json::from_str(&schema_str)
|
|
188
|
+
.unwrap_or_else(|_| serde_json::json!({"error": "Failed to generate OpenAPI schema"}));
|
|
189
|
+
|
|
190
|
+
axum::Json(schema)
|
|
191
|
+
}
|
|
192
|
+
|
|
171
193
|
#[cfg(test)]
|
|
172
194
|
mod tests {
|
|
173
195
|
use super::*;
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
use std::net::{IpAddr, SocketAddr};
|
|
4
4
|
|
|
5
|
-
use crate::{
|
|
5
|
+
use crate::{
|
|
6
|
+
ExtractionConfig, Result, core::ServerConfig, extractors, plugins::startup_validation::validate_plugins_at_startup,
|
|
7
|
+
};
|
|
6
8
|
|
|
7
9
|
use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
|
|
8
10
|
|
|
@@ -80,6 +82,10 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
|
80
82
|
server_config.max_multipart_field_bytes,
|
|
81
83
|
);
|
|
82
84
|
|
|
85
|
+
// Initialize extractors and validate plugins at startup
|
|
86
|
+
extractors::ensure_initialized()?;
|
|
87
|
+
validate_plugins_at_startup()?;
|
|
88
|
+
|
|
83
89
|
serve_with_config_and_limits(host, port, extraction_config, limits).await
|
|
84
90
|
}
|
|
85
91
|
|
|
@@ -111,6 +117,11 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
|
|
|
111
117
|
"Upload size limit: 100 MB (default, {} bytes)",
|
|
112
118
|
limits.max_request_body_bytes
|
|
113
119
|
);
|
|
120
|
+
|
|
121
|
+
// Initialize extractors and validate plugins at startup
|
|
122
|
+
extractors::ensure_initialized()?;
|
|
123
|
+
validate_plugins_at_startup()?;
|
|
124
|
+
|
|
114
125
|
serve_with_config_and_limits(host, port, config, limits).await
|
|
115
126
|
}
|
|
116
127
|
|
|
@@ -158,6 +169,10 @@ pub async fn serve_with_config_and_limits(
|
|
|
158
169
|
let addr = SocketAddr::new(ip, port);
|
|
159
170
|
let app = create_router_with_limits_and_server_config(config, limits, server_config);
|
|
160
171
|
|
|
172
|
+
// Initialize extractors and validate plugins at startup
|
|
173
|
+
extractors::ensure_initialized()?;
|
|
174
|
+
validate_plugins_at_startup()?;
|
|
175
|
+
|
|
161
176
|
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
162
177
|
|
|
163
178
|
let listener = tokio::net::TcpListener::bind(addr)
|
|
@@ -214,6 +229,10 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
214
229
|
let addr = SocketAddr::new(ip, server_config.port);
|
|
215
230
|
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
216
231
|
|
|
232
|
+
// Initialize extractors and validate plugins at startup
|
|
233
|
+
extractors::ensure_initialized()?;
|
|
234
|
+
validate_plugins_at_startup()?;
|
|
235
|
+
|
|
217
236
|
tracing::info!(
|
|
218
237
|
"Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
|
|
219
238
|
ip,
|
|
@@ -238,6 +257,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
238
257
|
/// Defaults: host = "127.0.0.1", port = 8000
|
|
239
258
|
///
|
|
240
259
|
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
260
|
+
/// Validates plugins at startup to help diagnose configuration issues.
|
|
241
261
|
pub async fn serve_default() -> Result<()> {
|
|
242
262
|
serve("127.0.0.1", 8000).await
|
|
243
263
|
}
|