kreuzberg 4.2.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '010412940492f83bc170a4a4efc644ac7e3818502734035523796287837a3893'
4
- data.tar.gz: af24cba007cc58283d678d4b15251ebae3a5740f60ade9a54cc19973a2022a82
3
+ metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
4
+ data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
5
5
  SHA512:
6
- metadata.gz: ad67348bec54a01ca3592ed72e9b2b8bc9e711a37e11b40ada31466c67f834132fc0de278c53a1c014fa6751da7abebae934cff2a9cc1835f7e056c895a273cb
7
- data.tar.gz: ca2cdb076a5d1af67f0e807978a966d1a391cc286bcdf5499544e3403196140918a54674beab77ea09fc0e8bc7ab66f357da5d984326a511b1d21643a3d6cf41
6
+ metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
7
+ data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.1)
4
+ kreuzberg (4.2.2)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.1)
210
+ kreuzberg (4.2.2)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -684,13 +684,6 @@ module Kreuzberg
684
684
  # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
685
685
  # config = Extraction.new(image_extraction: image)
686
686
  #
687
- # @example With preprocessing
688
- # preprocessing = Config::ImagePreprocessing.new(
689
- # binarization_method: "sauvola",
690
- # denoise: true
691
- # )
692
- # config = Extraction.new(image_preprocessing: preprocessing)
693
- #
694
687
  # @example With post-processing
695
688
  # postprocessor = Config::PostProcessor.new(
696
689
  # enabled: true,
@@ -708,14 +701,13 @@ module Kreuzberg
708
701
  # language_detection: Config::LanguageDetection.new(enabled: true),
709
702
  # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
710
703
  # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
711
- # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
712
704
  # postprocessor: Config::PostProcessor.new(enabled: true)
713
705
  # )
714
706
  #
715
707
  class Extraction
716
708
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
717
709
  :ocr, :chunking, :language_detection, :pdf_options,
718
- :images, :image_preprocessing, :postprocessor,
710
+ :images, :postprocessor,
719
711
  :token_reduction, :keywords, :html_options, :pages,
720
712
  :max_concurrent_extractions, :output_format, :result_format
721
713
 
@@ -739,7 +731,7 @@ module Kreuzberg
739
731
  # Keys that are allowed in the Extraction config
740
732
  ALLOWED_KEYS = %i[
741
733
  use_cache enable_quality_processing force_ocr ocr chunking
742
- language_detection pdf_options image_extraction image_preprocessing
734
+ language_detection pdf_options image_extraction
743
735
  postprocessor token_reduction keywords html_options pages
744
736
  max_concurrent_extractions output_format result_format
745
737
  ].freeze
@@ -800,14 +792,13 @@ module Kreuzberg
800
792
 
801
793
  def initialize(hash = nil,
802
794
  use_cache: true,
803
- enable_quality_processing: false,
795
+ enable_quality_processing: true,
804
796
  force_ocr: false,
805
797
  ocr: nil,
806
798
  chunking: nil,
807
799
  language_detection: nil,
808
800
  pdf_options: nil,
809
801
  image_extraction: nil,
810
- image_preprocessing: nil,
811
802
  postprocessor: nil,
812
803
  token_reduction: nil,
813
804
  keywords: nil,
@@ -820,7 +811,7 @@ module Kreuzberg
820
811
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
821
812
  force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
822
813
  pdf_options: pdf_options, image_extraction: image_extraction,
823
- image_preprocessing: image_preprocessing, postprocessor: postprocessor,
814
+ postprocessor: postprocessor,
824
815
  token_reduction: token_reduction, keywords: keywords, html_options: html_options,
825
816
  pages: pages, max_concurrent_extractions: max_concurrent_extractions,
826
817
  output_format: output_format, result_format: result_format
@@ -846,7 +837,6 @@ module Kreuzberg
846
837
  @language_detection = normalize_config(params[:language_detection], LanguageDetection)
847
838
  @pdf_options = normalize_config(params[:pdf_options], PDF)
848
839
  @images = normalize_config(params[:image_extraction], ImageExtraction)
849
- @image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
850
840
  @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
851
841
  @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
852
842
  @keywords = normalize_config(params[:keywords], Keywords)
@@ -878,7 +868,6 @@ module Kreuzberg
878
868
  end
879
869
 
880
870
  # rubocop:disable Metrics/CyclomaticComplexity
881
- # rubocop:disable Metrics/MethodLength
882
871
  def to_h
883
872
  {
884
873
  use_cache: @use_cache,
@@ -889,7 +878,6 @@ module Kreuzberg
889
878
  language_detection: @language_detection&.to_h,
890
879
  pdf_options: @pdf_options&.to_h,
891
880
  images: @images&.to_h,
892
- image_preprocessing: @image_preprocessing&.to_h,
893
881
  postprocessor: @postprocessor&.to_h,
894
882
  token_reduction: @token_reduction&.to_h,
895
883
  keywords: @keywords&.to_h,
@@ -900,7 +888,6 @@ module Kreuzberg
900
888
  result_format: @result_format
901
889
  }.compact
902
890
  end
903
- # rubocop:enable Metrics/MethodLength
904
891
  # rubocop:enable Metrics/CyclomaticComplexity
905
892
 
906
893
  # Serialize configuration to JSON string
@@ -1025,8 +1012,6 @@ module Kreuzberg
1025
1012
  @pdf_options = normalize_config(value, PDF)
1026
1013
  when :image_extraction
1027
1014
  @images = normalize_config(value, ImageExtraction)
1028
- when :image_preprocessing
1029
- @image_preprocessing = normalize_config(value, ImagePreprocessing)
1030
1015
  when :postprocessor
1031
1016
  @postprocessor = normalize_config(value, PostProcessor)
1032
1017
  when :token_reduction
@@ -1101,7 +1086,6 @@ module Kreuzberg
1101
1086
  @language_detection = merged.language_detection
1102
1087
  @pdf_options = merged.pdf_options
1103
1088
  @images = merged.image_extraction
1104
- @image_preprocessing = merged.image_preprocessing
1105
1089
  @postprocessor = merged.postprocessor
1106
1090
  @token_reduction = merged.token_reduction
1107
1091
  @keywords = merged.keywords
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.1'
4
+ VERSION = '4.2.2'
5
5
  end
@@ -309,7 +309,7 @@ RSpec.describe Kreuzberg::Config do
309
309
  config = described_class.new
310
310
 
311
311
  expect(config.use_cache).to be true
312
- expect(config.enable_quality_processing).to be false
312
+ expect(config.enable_quality_processing).to be true
313
313
  expect(config.force_ocr).to be false
314
314
  expect(config.ocr).to be_nil
315
315
  expect(config.chunking).to be_nil
@@ -6,7 +6,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
6
6
  config = described_class.new
7
7
 
8
8
  expect(config.use_cache).to be true
9
- expect(config.enable_quality_processing).to be false
9
+ expect(config.enable_quality_processing).to be true
10
10
  expect(config.force_ocr).to be false
11
11
  expect(config.ocr).to be_nil
12
12
  expect(config.chunking).to be_nil
@@ -103,7 +103,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
103
103
  hash = config.to_h
104
104
 
105
105
  expect(hash[:use_cache]).to be true
106
- expect(hash[:enable_quality_processing]).to be false
106
+ expect(hash[:enable_quality_processing]).to be true
107
107
  expect(hash[:force_ocr]).to be false
108
108
  end
109
109
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.1"
6
+ version = "4.2.2"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.1"
3
+ version = "4.2.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -71,7 +71,7 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
71
71
  keywords-rake = ["dep:rake", "stopwords"]
72
72
  keywords = ["keywords-yake", "keywords-rake"]
73
73
 
74
- api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
74
+ api = ["dep:axum", "dep:tower", "dep:tower-http", "dep:utoipa", "tokio-runtime"]
75
75
  mcp = ["dep:rmcp", "tokio-runtime"]
76
76
  mcp-http = ["mcp", "api"]
77
77
 
@@ -198,6 +198,7 @@ rake = { version = "0.3.6", optional = true }
198
198
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
199
199
  tower = { version = "0.5", optional = true }
200
200
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
201
+ utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
201
202
  rmcp = { version = "0.14.0", features = [
202
203
  "server",
203
204
  "macros",
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.1 Release**
20
+ > **🚀 Version 4.2.2 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -2,14 +2,38 @@
2
2
 
3
3
  use axum::{
4
4
  Json,
5
+ extract::{FromRequest, Request, rejection::JsonRejection},
5
6
  http::StatusCode,
6
7
  response::{IntoResponse, Response},
7
8
  };
9
+ use serde::de::DeserializeOwned;
8
10
 
9
11
  use crate::error::KreuzbergError;
10
12
 
11
13
  use super::types::ErrorResponse;
12
14
 
15
+ /// Custom JSON extractor that returns JSON error responses instead of plain text.
16
+ ///
17
+ /// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
18
+ /// ensuring that all JSON parsing errors are returned as JSON with proper content type.
19
+ #[derive(Debug, Clone, Copy, Default)]
20
+ pub struct JsonApi<T>(pub T);
21
+
22
+ impl<T, S> FromRequest<S> for JsonApi<T>
23
+ where
24
+ T: DeserializeOwned,
25
+ S: Send + Sync,
26
+ {
27
+ type Rejection = ApiError;
28
+
29
+ async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
30
+ match Json::<T>::from_request(req, state).await {
31
+ Ok(Json(value)) => Ok(JsonApi(value)),
32
+ Err(rejection) => Err(ApiError::from(rejection)),
33
+ }
34
+ }
35
+ }
36
+
13
37
  /// API-specific error wrapper.
14
38
  #[derive(Debug)]
15
39
  pub struct ApiError {
@@ -79,3 +103,39 @@ impl From<KreuzbergError> for ApiError {
79
103
  }
80
104
  }
81
105
  }
106
+
107
+ impl From<JsonRejection> for ApiError {
108
+ fn from(rejection: JsonRejection) -> Self {
109
+ let (status, message) = match rejection {
110
+ JsonRejection::JsonDataError(err) => (
111
+ StatusCode::UNPROCESSABLE_ENTITY,
112
+ format!(
113
+ "Failed to deserialize the JSON body into the target type: {}",
114
+ err.body_text()
115
+ ),
116
+ ),
117
+ JsonRejection::JsonSyntaxError(err) => (
118
+ StatusCode::BAD_REQUEST,
119
+ format!("Failed to parse the request body as JSON: {}", err.body_text()),
120
+ ),
121
+ JsonRejection::MissingJsonContentType(_) => (
122
+ StatusCode::UNSUPPORTED_MEDIA_TYPE,
123
+ "Expected request with `Content-Type: application/json`".to_string(),
124
+ ),
125
+ JsonRejection::BytesRejection(err) => {
126
+ (StatusCode::BAD_REQUEST, format!("Failed to read request body: {}", err))
127
+ }
128
+ _ => (StatusCode::BAD_REQUEST, "Unknown JSON parsing error".to_string()),
129
+ };
130
+
131
+ Self {
132
+ status,
133
+ body: ErrorResponse {
134
+ error_type: "JsonParsingError".to_string(),
135
+ message,
136
+ traceback: None,
137
+ status_code: status.as_u16(),
138
+ },
139
+ }
140
+ }
141
+ }
@@ -8,13 +8,60 @@ use axum::{
8
8
  use crate::{batch_extract_bytes, cache, extract_bytes};
9
9
 
10
10
  use super::{
11
- error::ApiError,
11
+ error::{ApiError, JsonApi},
12
12
  types::{
13
13
  ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
14
14
  ExtractResponse, HealthResponse, InfoResponse,
15
15
  },
16
16
  };
17
17
 
18
+ /// Health check endpoint handler.
19
+ ///
20
+ /// GET /health
21
+ #[utoipa::path(
22
+ get,
23
+ path = "/health",
24
+ tag = "health",
25
+ responses(
26
+ (status = 200, description = "Service is healthy", body = HealthResponse),
27
+ )
28
+ )]
29
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
30
+ pub async fn health_handler() -> Json<HealthResponse> {
31
+ // Get plugin status
32
+ let plugin_status = crate::plugins::startup_validation::PluginHealthStatus::check();
33
+
34
+ Json(HealthResponse {
35
+ status: "healthy".to_string(),
36
+ version: env!("CARGO_PKG_VERSION").to_string(),
37
+ plugins: Some(super::types::PluginStatus {
38
+ ocr_backends_count: plugin_status.ocr_backends_count,
39
+ ocr_backends: plugin_status.ocr_backends,
40
+ extractors_count: plugin_status.extractors_count,
41
+ post_processors_count: plugin_status.post_processors_count,
42
+ }),
43
+ })
44
+ }
45
+
46
+ /// Server info endpoint handler.
47
+ ///
48
+ /// GET /info
49
+ #[utoipa::path(
50
+ get,
51
+ path = "/info",
52
+ tag = "health",
53
+ responses(
54
+ (status = 200, description = "Server information", body = InfoResponse),
55
+ )
56
+ )]
57
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
58
+ pub async fn info_handler() -> Json<InfoResponse> {
59
+ Json(InfoResponse {
60
+ version: env!("CARGO_PKG_VERSION").to_string(),
61
+ rust_backend: true,
62
+ })
63
+ }
64
+
18
65
  /// Extract endpoint handler.
19
66
  ///
20
67
  /// POST /extract
@@ -37,6 +84,19 @@ use super::{
37
84
  ///
38
85
  /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
39
86
  /// is used as the base, and any per-request config overrides those defaults.
87
+ // TODO: Add utoipa::path annotation once ExtractionResult implements ToSchema
88
+ // #[utoipa::path(
89
+ // post,
90
+ // path = "/extract",
91
+ // tag = "extraction",
92
+ // request_body(content_type = "multipart/form-data"),
93
+ // responses(
94
+ // (status = 200, description = "Extraction successful", body = ExtractResponse),
95
+ // (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
96
+ // (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
97
+ // (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
98
+ // )
99
+ // )]
40
100
  #[cfg_attr(
41
101
  feature = "otel",
42
102
  tracing::instrument(
@@ -132,28 +192,6 @@ pub async fn extract_handler(
132
192
  Ok(Json(results))
133
193
  }
134
194
 
135
- /// Health check endpoint handler.
136
- ///
137
- /// GET /health
138
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
139
- pub async fn health_handler() -> Json<HealthResponse> {
140
- Json(HealthResponse {
141
- status: "healthy".to_string(),
142
- version: env!("CARGO_PKG_VERSION").to_string(),
143
- })
144
- }
145
-
146
- /// Server info endpoint handler.
147
- ///
148
- /// GET /info
149
- #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
150
- pub async fn info_handler() -> Json<InfoResponse> {
151
- Json(InfoResponse {
152
- version: env!("CARGO_PKG_VERSION").to_string(),
153
- rust_backend: true,
154
- })
155
- }
156
-
157
195
  /// Cache stats endpoint handler.
158
196
  ///
159
197
  /// GET /cache/stats
@@ -164,6 +202,15 @@ pub async fn info_handler() -> Json<InfoResponse> {
164
202
  /// - Current directory cannot be determined
165
203
  /// - Cache directory path contains non-UTF8 characters
166
204
  /// - Cache metadata retrieval fails
205
+ #[utoipa::path(
206
+ get,
207
+ path = "/cache/stats",
208
+ tag = "cache",
209
+ responses(
210
+ (status = 200, description = "Cache statistics", body = CacheStatsResponse),
211
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
212
+ )
213
+ )]
167
214
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
168
215
  pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
169
216
  let cache_dir = std::env::current_dir()
@@ -204,6 +251,15 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
204
251
  /// - Current directory cannot be determined
205
252
  /// - Cache directory path contains non-UTF8 characters
206
253
  /// - Cache clearing operation fails
254
+ #[utoipa::path(
255
+ delete,
256
+ path = "/cache/clear",
257
+ tag = "cache",
258
+ responses(
259
+ (status = 200, description = "Cache cleared", body = CacheClearResponse),
260
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
261
+ )
262
+ )]
207
263
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
208
264
  pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
209
265
  let cache_dir = std::env::current_dir()
@@ -248,6 +304,18 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
248
304
  /// - ONNX Runtime is not available
249
305
  /// - Model initialization fails
250
306
  /// - Embedding generation fails
307
+ #[utoipa::path(
308
+ post,
309
+ path = "/embed",
310
+ tag = "embeddings",
311
+ request_body = EmbedRequest,
312
+ responses(
313
+ (status = 200, description = "Embeddings generated", body = EmbedResponse),
314
+ (status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
315
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
316
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
317
+ )
318
+ )]
251
319
  #[cfg(feature = "embeddings")]
252
320
  #[cfg_attr(
253
321
  feature = "otel",
@@ -260,7 +328,7 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
260
328
  )
261
329
  )
262
330
  )]
263
- pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
331
+ pub async fn embed_handler(JsonApi(request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
264
332
  use crate::types::{Chunk, ChunkMetadata};
265
333
 
266
334
  if request.texts.is_empty() {
@@ -269,6 +337,13 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
269
337
  )));
270
338
  }
271
339
 
340
+ // Validate that no texts are empty
341
+ if request.texts.iter().any(|t| t.is_empty()) {
342
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(
343
+ "All text entries must be non-empty strings",
344
+ )));
345
+ }
346
+
272
347
  // Use default config if none provided
273
348
  let config = request.config.unwrap_or_default();
274
349
 
@@ -331,8 +406,20 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
331
406
  /// Embedding endpoint handler (when embeddings feature is disabled).
332
407
  ///
333
408
  /// Returns an error indicating embeddings feature is not enabled.
409
+ #[utoipa::path(
410
+ post,
411
+ path = "/embed",
412
+ tag = "embeddings",
413
+ request_body = EmbedRequest,
414
+ responses(
415
+ (status = 200, description = "Embeddings generated", body = EmbedResponse),
416
+ (status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
417
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
418
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
419
+ )
420
+ )]
334
421
  #[cfg(not(feature = "embeddings"))]
335
- pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
422
+ pub async fn embed_handler(JsonApi(_request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
336
423
  Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
337
424
  "Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
338
425
  )))
@@ -344,6 +431,18 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
344
431
  ///
345
432
  /// Accepts JSON body with text and optional configuration.
346
433
  /// Returns chunks with metadata.
434
+ #[utoipa::path(
435
+ post,
436
+ path = "/chunk",
437
+ tag = "chunking",
438
+ request_body = ChunkRequest,
439
+ responses(
440
+ (status = 200, description = "Text chunked successfully", body = ChunkResponse),
441
+ (status = 400, description = "Bad request - validation failed (e.g., empty text)", body = crate::api::types::ErrorResponse),
442
+ (status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
443
+ (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
444
+ )
445
+ )]
347
446
  #[cfg_attr(
348
447
  feature = "otel",
349
448
  tracing::instrument(
@@ -352,7 +451,7 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
352
451
  fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
353
452
  )
354
453
  )]
355
- pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
454
+ pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
356
455
  use super::types::{ChunkItem, ChunkingConfigResponse};
357
456
  use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
358
457
 
@@ -363,9 +462,9 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
363
462
  )));
364
463
  }
365
464
 
366
- // Parse chunker_type
465
+ // Parse chunker_type (empty string is invalid, use default by omitting the field)
367
466
  let chunker_type = match request.chunker_type.to_lowercase().as_str() {
368
- "text" | "" => ChunkerType::Text,
467
+ "text" => ChunkerType::Text,
369
468
  "markdown" => ChunkerType::Markdown,
370
469
  other => {
371
470
  return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
@@ -377,15 +476,37 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
377
476
 
378
477
  // Build config with defaults
379
478
  let cfg = request.config.unwrap_or_default();
479
+ let max_characters = cfg.max_characters.unwrap_or(2000);
480
+ let overlap = cfg.overlap.unwrap_or(100);
481
+
482
+ // Validate chunking configuration
483
+ if overlap >= max_characters {
484
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
485
+ "Invalid chunking configuration: overlap ({}) must be less than max_characters ({})",
486
+ overlap, max_characters
487
+ ))));
488
+ }
489
+
380
490
  let config = ChunkingConfig {
381
- max_characters: cfg.max_characters.unwrap_or(2000),
382
- overlap: cfg.overlap.unwrap_or(100),
491
+ max_characters,
492
+ overlap,
383
493
  trim: cfg.trim.unwrap_or(true),
384
494
  chunker_type,
385
495
  };
386
496
 
387
- // Perform chunking
388
- let result = chunk_text(&request.text, &config, None).map_err(ApiError::internal)?;
497
+ // Perform chunking - convert any remaining errors to validation errors since they're likely config issues
498
+ let result = chunk_text(&request.text, &config, None).map_err(|e| {
499
+ // Check if error message indicates a configuration issue
500
+ let msg = e.to_string();
501
+ if msg.contains("configuration") || msg.contains("overlap") || msg.contains("capacity") {
502
+ ApiError::validation(crate::error::KreuzbergError::validation(format!(
503
+ "Invalid chunking configuration: {}",
504
+ msg
505
+ )))
506
+ } else {
507
+ ApiError::internal(e)
508
+ }
509
+ })?;
389
510
 
390
511
  // Transform to response
391
512
  let chunks = result
@@ -87,6 +87,8 @@
87
87
  mod config;
88
88
  mod error;
89
89
  mod handlers;
90
+ #[cfg(feature = "api")]
91
+ pub mod openapi;
90
92
  mod router;
91
93
  mod startup;
92
94
  mod types;