kreuzberg 4.2.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/lib/kreuzberg/config.rb +4 -20
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/config_spec.rb +1 -1
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +11 -5
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
//! OpenAPI 3.1 schema generation for Kreuzberg API.
|
|
2
|
+
//!
|
|
3
|
+
//! This module generates OpenAPI documentation from Rust types using utoipa.
|
|
4
|
+
//! The schema is available at the `/openapi.json` endpoint.
|
|
5
|
+
|
|
6
|
+
#[cfg(feature = "api")]
|
|
7
|
+
use utoipa::OpenApi;
|
|
8
|
+
|
|
9
|
+
/// OpenAPI documentation structure.
|
|
10
|
+
///
|
|
11
|
+
/// Defines all endpoints, request/response schemas, and examples
|
|
12
|
+
/// for the Kreuzberg document extraction API.
|
|
13
|
+
#[cfg(feature = "api")]
|
|
14
|
+
#[derive(OpenApi)]
|
|
15
|
+
#[openapi(
|
|
16
|
+
info(
|
|
17
|
+
title = "Kreuzberg API",
|
|
18
|
+
version = env!("CARGO_PKG_VERSION"),
|
|
19
|
+
description = "High-performance document intelligence API for extracting text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats.",
|
|
20
|
+
contact(
|
|
21
|
+
name = "Kreuzberg",
|
|
22
|
+
url = "https://kreuzberg.dev"
|
|
23
|
+
),
|
|
24
|
+
license(
|
|
25
|
+
name = "Apache-2.0 OR MIT"
|
|
26
|
+
)
|
|
27
|
+
),
|
|
28
|
+
servers(
|
|
29
|
+
(url = "http://localhost:8000", description = "Local development server"),
|
|
30
|
+
(url = "https://api.kreuzberg.dev", description = "Production server (example)")
|
|
31
|
+
),
|
|
32
|
+
paths(
|
|
33
|
+
crate::api::handlers::health_handler,
|
|
34
|
+
crate::api::handlers::info_handler,
|
|
35
|
+
// Note: extract_handler omitted - requires ExtractionResult ToSchema impl
|
|
36
|
+
crate::api::handlers::cache_stats_handler,
|
|
37
|
+
crate::api::handlers::cache_clear_handler,
|
|
38
|
+
crate::api::handlers::embed_handler,
|
|
39
|
+
crate::api::handlers::chunk_handler,
|
|
40
|
+
),
|
|
41
|
+
components(
|
|
42
|
+
schemas(
|
|
43
|
+
crate::api::types::HealthResponse,
|
|
44
|
+
crate::api::types::PluginStatus,
|
|
45
|
+
crate::api::types::InfoResponse,
|
|
46
|
+
crate::api::types::ErrorResponse,
|
|
47
|
+
crate::api::types::CacheStatsResponse,
|
|
48
|
+
crate::api::types::CacheClearResponse,
|
|
49
|
+
crate::api::types::EmbedRequest,
|
|
50
|
+
crate::api::types::EmbedResponse,
|
|
51
|
+
crate::api::types::ChunkRequest,
|
|
52
|
+
crate::api::types::ChunkResponse,
|
|
53
|
+
crate::api::types::ChunkItem,
|
|
54
|
+
crate::api::types::ChunkingConfigRequest,
|
|
55
|
+
crate::api::types::ChunkingConfigResponse,
|
|
56
|
+
)
|
|
57
|
+
),
|
|
58
|
+
tags(
|
|
59
|
+
(name = "health", description = "Health and status endpoints"),
|
|
60
|
+
(name = "extraction", description = "Document extraction endpoints"),
|
|
61
|
+
(name = "cache", description = "Cache management endpoints"),
|
|
62
|
+
(name = "embeddings", description = "Text embedding generation"),
|
|
63
|
+
(name = "chunking", description = "Text chunking operations")
|
|
64
|
+
)
|
|
65
|
+
)]
|
|
66
|
+
pub struct ApiDoc;
|
|
67
|
+
|
|
68
|
+
/// Generate OpenAPI JSON schema.
|
|
69
|
+
///
|
|
70
|
+
/// Returns the complete OpenAPI 3.1 specification as a JSON string.
|
|
71
|
+
///
|
|
72
|
+
/// # Examples
|
|
73
|
+
///
|
|
74
|
+
/// ```no_run
|
|
75
|
+
/// use kreuzberg::api::openapi::openapi_json;
|
|
76
|
+
///
|
|
77
|
+
/// let schema = openapi_json();
|
|
78
|
+
/// println!("{}", schema);
|
|
79
|
+
/// ```
|
|
80
|
+
#[cfg(feature = "api")]
|
|
81
|
+
pub fn openapi_json() -> String {
|
|
82
|
+
ApiDoc::openapi().to_pretty_json().unwrap_or_else(|_| "{}".to_string())
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
#[cfg(not(feature = "api"))]
|
|
86
|
+
pub fn openapi_json() -> String {
|
|
87
|
+
r#"{"error": "API feature not enabled"}"#.to_string()
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[cfg(test)]
|
|
91
|
+
mod tests {
|
|
92
|
+
#[cfg(feature = "api")]
|
|
93
|
+
use super::*;
|
|
94
|
+
|
|
95
|
+
#[test]
|
|
96
|
+
#[cfg(feature = "api")]
|
|
97
|
+
fn test_openapi_schema_generation() {
|
|
98
|
+
let schema = openapi_json();
|
|
99
|
+
assert!(!schema.is_empty());
|
|
100
|
+
assert!(schema.contains("Kreuzberg API"));
|
|
101
|
+
assert!(schema.contains("/health"));
|
|
102
|
+
assert!(schema.contains("/extract"));
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
#[test]
|
|
106
|
+
#[cfg(feature = "api")]
|
|
107
|
+
fn test_openapi_schema_valid_json() {
|
|
108
|
+
let schema = openapi_json();
|
|
109
|
+
let parsed: serde_json::Value = serde_json::from_str(&schema).expect("Invalid JSON");
|
|
110
|
+
assert!(parsed.is_object());
|
|
111
|
+
assert!(parsed["openapi"].is_string());
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
#[test]
|
|
115
|
+
#[cfg(feature = "api")]
|
|
116
|
+
fn test_openapi_includes_all_endpoints() {
|
|
117
|
+
let schema = openapi_json();
|
|
118
|
+
// Health endpoints
|
|
119
|
+
assert!(schema.contains("/health"));
|
|
120
|
+
assert!(schema.contains("/info"));
|
|
121
|
+
// Extraction
|
|
122
|
+
assert!(schema.contains("/extract"));
|
|
123
|
+
// Cache
|
|
124
|
+
assert!(schema.contains("/cache/stats"));
|
|
125
|
+
assert!(schema.contains("/cache/clear"));
|
|
126
|
+
// Embeddings
|
|
127
|
+
assert!(schema.contains("/embed"));
|
|
128
|
+
// Chunking
|
|
129
|
+
assert!(schema.contains("/chunk"));
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
#[test]
|
|
133
|
+
#[cfg(feature = "api")]
|
|
134
|
+
fn test_openapi_includes_schemas() {
|
|
135
|
+
let schema = openapi_json();
|
|
136
|
+
assert!(schema.contains("HealthResponse"));
|
|
137
|
+
assert!(schema.contains("ErrorResponse"));
|
|
138
|
+
assert!(schema.contains("EmbedRequest"));
|
|
139
|
+
assert!(schema.contains("ChunkRequest"));
|
|
140
|
+
}
|
|
141
|
+
}
|
|
@@ -153,14 +153,22 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
153
153
|
}
|
|
154
154
|
};
|
|
155
155
|
|
|
156
|
-
Router::new()
|
|
156
|
+
let mut router = Router::new()
|
|
157
157
|
.route("/extract", post(extract_handler))
|
|
158
158
|
.route("/embed", post(embed_handler))
|
|
159
159
|
.route("/chunk", post(chunk_handler))
|
|
160
160
|
.route("/health", get(health_handler))
|
|
161
161
|
.route("/info", get(info_handler))
|
|
162
162
|
.route("/cache/stats", get(cache_stats_handler))
|
|
163
|
-
.route("/cache/clear", delete(cache_clear_handler))
|
|
163
|
+
.route("/cache/clear", delete(cache_clear_handler));
|
|
164
|
+
|
|
165
|
+
// Add OpenAPI schema endpoint if API feature is enabled
|
|
166
|
+
#[cfg(feature = "api")]
|
|
167
|
+
{
|
|
168
|
+
router = router.route("/openapi.json", get(openapi_schema_handler));
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
router
|
|
164
172
|
.layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
|
|
165
173
|
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
166
174
|
.layer(cors_layer)
|
|
@@ -168,6 +176,20 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
168
176
|
.with_state(state)
|
|
169
177
|
}
|
|
170
178
|
|
|
179
|
+
/// OpenAPI schema handler.
|
|
180
|
+
///
|
|
181
|
+
/// Returns the OpenAPI 3.1 JSON schema for all documented endpoints.
|
|
182
|
+
#[cfg(feature = "api")]
|
|
183
|
+
async fn openapi_schema_handler() -> axum::Json<serde_json::Value> {
|
|
184
|
+
use crate::api::openapi::openapi_json;
|
|
185
|
+
|
|
186
|
+
let schema_str = openapi_json();
|
|
187
|
+
let schema: serde_json::Value = serde_json::from_str(&schema_str)
|
|
188
|
+
.unwrap_or_else(|_| serde_json::json!({"error": "Failed to generate OpenAPI schema"}));
|
|
189
|
+
|
|
190
|
+
axum::Json(schema)
|
|
191
|
+
}
|
|
192
|
+
|
|
171
193
|
#[cfg(test)]
|
|
172
194
|
mod tests {
|
|
173
195
|
use super::*;
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
use std::net::{IpAddr, SocketAddr};
|
|
4
4
|
|
|
5
|
-
use crate::{
|
|
5
|
+
use crate::{
|
|
6
|
+
ExtractionConfig, Result, core::ServerConfig, extractors, plugins::startup_validation::validate_plugins_at_startup,
|
|
7
|
+
};
|
|
6
8
|
|
|
7
9
|
use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
|
|
8
10
|
|
|
@@ -80,7 +82,8 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
|
80
82
|
server_config.max_multipart_field_bytes,
|
|
81
83
|
);
|
|
82
84
|
|
|
83
|
-
//
|
|
85
|
+
// Initialize extractors and validate plugins at startup
|
|
86
|
+
extractors::ensure_initialized()?;
|
|
84
87
|
validate_plugins_at_startup()?;
|
|
85
88
|
|
|
86
89
|
serve_with_config_and_limits(host, port, extraction_config, limits).await
|
|
@@ -115,7 +118,8 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
|
|
|
115
118
|
limits.max_request_body_bytes
|
|
116
119
|
);
|
|
117
120
|
|
|
118
|
-
//
|
|
121
|
+
// Initialize extractors and validate plugins at startup
|
|
122
|
+
extractors::ensure_initialized()?;
|
|
119
123
|
validate_plugins_at_startup()?;
|
|
120
124
|
|
|
121
125
|
serve_with_config_and_limits(host, port, config, limits).await
|
|
@@ -165,7 +169,8 @@ pub async fn serve_with_config_and_limits(
|
|
|
165
169
|
let addr = SocketAddr::new(ip, port);
|
|
166
170
|
let app = create_router_with_limits_and_server_config(config, limits, server_config);
|
|
167
171
|
|
|
168
|
-
//
|
|
172
|
+
// Initialize extractors and validate plugins at startup
|
|
173
|
+
extractors::ensure_initialized()?;
|
|
169
174
|
validate_plugins_at_startup()?;
|
|
170
175
|
|
|
171
176
|
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
@@ -224,7 +229,8 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
|
|
|
224
229
|
let addr = SocketAddr::new(ip, server_config.port);
|
|
225
230
|
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
226
231
|
|
|
227
|
-
//
|
|
232
|
+
// Initialize extractors and validate plugins at startup
|
|
233
|
+
extractors::ensure_initialized()?;
|
|
228
234
|
validate_plugins_at_startup()?;
|
|
229
235
|
|
|
230
236
|
tracing::info!(
|
|
@@ -109,19 +109,41 @@ impl ApiSizeLimits {
|
|
|
109
109
|
}
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
+
/// Plugin status information in health response.
|
|
113
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
114
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
115
|
+
pub struct PluginStatus {
|
|
116
|
+
/// Number of registered OCR backends
|
|
117
|
+
pub ocr_backends_count: usize,
|
|
118
|
+
/// Names of registered OCR backends
|
|
119
|
+
pub ocr_backends: Vec<String>,
|
|
120
|
+
/// Number of registered document extractors
|
|
121
|
+
pub extractors_count: usize,
|
|
122
|
+
/// Number of registered post-processors
|
|
123
|
+
pub post_processors_count: usize,
|
|
124
|
+
}
|
|
125
|
+
|
|
112
126
|
/// Health check response.
|
|
113
127
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
128
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
114
129
|
pub struct HealthResponse {
|
|
115
130
|
/// Health status
|
|
131
|
+
#[cfg_attr(feature = "api", schema(example = "healthy"))]
|
|
116
132
|
pub status: String,
|
|
117
133
|
/// API version
|
|
134
|
+
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
|
|
118
135
|
pub version: String,
|
|
136
|
+
/// Plugin status (optional)
|
|
137
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
138
|
+
pub plugins: Option<PluginStatus>,
|
|
119
139
|
}
|
|
120
140
|
|
|
121
141
|
/// Server information response.
|
|
122
142
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
143
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
123
144
|
pub struct InfoResponse {
|
|
124
145
|
/// API version
|
|
146
|
+
#[cfg_attr(feature = "api", schema(example = "0.8.0"))]
|
|
125
147
|
pub version: String,
|
|
126
148
|
/// Whether using Rust backend
|
|
127
149
|
pub rust_backend: bool,
|
|
@@ -132,15 +154,19 @@ pub type ExtractResponse = Vec<ExtractionResult>;
|
|
|
132
154
|
|
|
133
155
|
/// Error response.
|
|
134
156
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
157
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
135
158
|
pub struct ErrorResponse {
|
|
136
159
|
/// Error type name
|
|
160
|
+
#[cfg_attr(feature = "api", schema(example = "ValidationError"))]
|
|
137
161
|
pub error_type: String,
|
|
138
162
|
/// Error message
|
|
163
|
+
#[cfg_attr(feature = "api", schema(example = "Invalid input provided"))]
|
|
139
164
|
pub message: String,
|
|
140
165
|
/// Stack trace (if available)
|
|
141
166
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
142
167
|
pub traceback: Option<String>,
|
|
143
168
|
/// HTTP status code
|
|
169
|
+
#[cfg_attr(feature = "api", schema(example = 400))]
|
|
144
170
|
pub status_code: u16,
|
|
145
171
|
}
|
|
146
172
|
|
|
@@ -156,8 +182,10 @@ pub struct ApiState {
|
|
|
156
182
|
|
|
157
183
|
/// Cache statistics response.
|
|
158
184
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
185
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
159
186
|
pub struct CacheStatsResponse {
|
|
160
187
|
/// Cache directory path
|
|
188
|
+
#[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
|
|
161
189
|
pub directory: String,
|
|
162
190
|
/// Total number of cache files
|
|
163
191
|
pub total_files: usize,
|
|
@@ -173,8 +201,10 @@ pub struct CacheStatsResponse {
|
|
|
173
201
|
|
|
174
202
|
/// Cache clear response.
|
|
175
203
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
204
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
176
205
|
pub struct CacheClearResponse {
|
|
177
206
|
/// Cache directory path
|
|
207
|
+
#[cfg_attr(feature = "api", schema(example = "/tmp/kreuzberg-cache"))]
|
|
178
208
|
pub directory: String,
|
|
179
209
|
/// Number of files removed
|
|
180
210
|
pub removed_files: usize,
|
|
@@ -184,20 +214,25 @@ pub struct CacheClearResponse {
|
|
|
184
214
|
|
|
185
215
|
/// Embedding request for generating embeddings from text.
|
|
186
216
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
217
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
187
218
|
pub struct EmbedRequest {
|
|
188
|
-
/// Text strings to generate embeddings for
|
|
219
|
+
/// Text strings to generate embeddings for (at least one non-empty string required)
|
|
220
|
+
#[cfg_attr(feature = "api", schema(min_items = 1))]
|
|
189
221
|
pub texts: Vec<String>,
|
|
190
222
|
/// Optional embedding configuration (model, batch size, etc.)
|
|
191
223
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
224
|
+
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
|
|
192
225
|
pub config: Option<crate::core::config::EmbeddingConfig>,
|
|
193
226
|
}
|
|
194
227
|
|
|
195
228
|
/// Embedding response containing generated embeddings.
|
|
196
229
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
230
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
197
231
|
pub struct EmbedResponse {
|
|
198
232
|
/// Generated embeddings (one per input text)
|
|
199
233
|
pub embeddings: Vec<Vec<f32>>,
|
|
200
234
|
/// Model used for embedding generation
|
|
235
|
+
#[cfg_attr(feature = "api", schema(example = "all-MiniLM-L6-v2"))]
|
|
201
236
|
pub model: String,
|
|
202
237
|
/// Dimensionality of the embeddings
|
|
203
238
|
pub dimensions: usize,
|
|
@@ -212,23 +247,29 @@ fn default_chunker_type() -> String {
|
|
|
212
247
|
|
|
213
248
|
/// Chunk request with text and configuration.
|
|
214
249
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
250
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
215
251
|
pub struct ChunkRequest {
|
|
216
|
-
/// Text to chunk
|
|
252
|
+
/// Text to chunk (must not be empty)
|
|
253
|
+
#[cfg_attr(feature = "api", schema(example = "This is sample text to chunk.", min_length = 1))]
|
|
217
254
|
pub text: String,
|
|
218
255
|
/// Optional chunking configuration
|
|
219
256
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
220
257
|
pub config: Option<ChunkingConfigRequest>,
|
|
221
258
|
/// Chunker type (text or markdown)
|
|
222
259
|
#[serde(default = "default_chunker_type")]
|
|
260
|
+
#[cfg_attr(feature = "api", schema(example = "text", pattern = "^(text|markdown)$"))]
|
|
223
261
|
pub chunker_type: String,
|
|
224
262
|
}
|
|
225
263
|
|
|
226
264
|
/// Chunking configuration request.
|
|
227
265
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
266
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
228
267
|
pub struct ChunkingConfigRequest {
|
|
229
|
-
/// Maximum characters per chunk
|
|
268
|
+
/// Maximum characters per chunk (must be greater than overlap, default: 2000)
|
|
269
|
+
#[cfg_attr(feature = "api", schema(minimum = 101, example = 2000))]
|
|
230
270
|
pub max_characters: Option<usize>,
|
|
231
|
-
/// Overlap between chunks in characters
|
|
271
|
+
/// Overlap between chunks in characters (must be less than max_characters, default: 100)
|
|
272
|
+
#[cfg_attr(feature = "api", schema(minimum = 0, maximum = 1999, example = 100))]
|
|
232
273
|
pub overlap: Option<usize>,
|
|
233
274
|
/// Whether to trim whitespace
|
|
234
275
|
pub trim: Option<bool>,
|
|
@@ -236,6 +277,7 @@ pub struct ChunkingConfigRequest {
|
|
|
236
277
|
|
|
237
278
|
/// Chunk response with chunks and metadata.
|
|
238
279
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
280
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
239
281
|
pub struct ChunkResponse {
|
|
240
282
|
/// List of chunks
|
|
241
283
|
pub chunks: Vec<ChunkItem>,
|
|
@@ -246,11 +288,13 @@ pub struct ChunkResponse {
|
|
|
246
288
|
/// Input text size in bytes
|
|
247
289
|
pub input_size_bytes: usize,
|
|
248
290
|
/// Chunker type used for chunking
|
|
291
|
+
#[cfg_attr(feature = "api", schema(example = "text"))]
|
|
249
292
|
pub chunker_type: String,
|
|
250
293
|
}
|
|
251
294
|
|
|
252
295
|
/// Individual chunk item with metadata.
|
|
253
296
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
297
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
254
298
|
pub struct ChunkItem {
|
|
255
299
|
/// Chunk content
|
|
256
300
|
pub content: String,
|
|
@@ -272,6 +316,7 @@ pub struct ChunkItem {
|
|
|
272
316
|
|
|
273
317
|
/// Chunking configuration response.
|
|
274
318
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
319
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
275
320
|
pub struct ChunkingConfigResponse {
|
|
276
321
|
/// Maximum characters per chunk
|
|
277
322
|
pub max_characters: usize,
|
|
@@ -280,5 +325,6 @@ pub struct ChunkingConfigResponse {
|
|
|
280
325
|
/// Whether whitespace was trimmed
|
|
281
326
|
pub trim: bool,
|
|
282
327
|
/// Type of chunker used
|
|
328
|
+
#[cfg_attr(feature = "api", schema(example = "text"))]
|
|
283
329
|
pub chunker_type: String,
|
|
284
330
|
}
|
|
@@ -84,7 +84,8 @@ pub struct ChunkingConfig {
|
|
|
84
84
|
/// Requires the `embeddings` feature to be enabled.
|
|
85
85
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
86
86
|
pub struct EmbeddingConfig {
|
|
87
|
-
/// The embedding model to use
|
|
87
|
+
/// The embedding model to use (defaults to "balanced" preset if not specified)
|
|
88
|
+
#[serde(default = "default_model")]
|
|
88
89
|
pub model: EmbeddingModelType,
|
|
89
90
|
|
|
90
91
|
/// Whether to normalize embedding vectors (recommended for cosine similarity)
|
|
@@ -156,6 +157,12 @@ fn default_batch_size() -> usize {
|
|
|
156
157
|
32
|
|
157
158
|
}
|
|
158
159
|
|
|
160
|
+
fn default_model() -> EmbeddingModelType {
|
|
161
|
+
EmbeddingModelType::Preset {
|
|
162
|
+
name: "balanced".to_string(),
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
159
166
|
#[cfg(test)]
|
|
160
167
|
mod tests {
|
|
161
168
|
use super::*;
|