kreuzberg 4.2.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/lib/kreuzberg/config.rb +4 -20
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/config_spec.rb +1 -1
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +11 -5
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
|
|
4
|
+
data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
|
|
7
|
+
data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.1)
|
|
4
|
+
kreuzberg (4.2.2)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.1)
|
|
210
|
+
kreuzberg (4.2.2)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -684,13 +684,6 @@ module Kreuzberg
|
|
|
684
684
|
# image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
|
|
685
685
|
# config = Extraction.new(image_extraction: image)
|
|
686
686
|
#
|
|
687
|
-
# @example With preprocessing
|
|
688
|
-
# preprocessing = Config::ImagePreprocessing.new(
|
|
689
|
-
# binarization_method: "sauvola",
|
|
690
|
-
# denoise: true
|
|
691
|
-
# )
|
|
692
|
-
# config = Extraction.new(image_preprocessing: preprocessing)
|
|
693
|
-
#
|
|
694
687
|
# @example With post-processing
|
|
695
688
|
# postprocessor = Config::PostProcessor.new(
|
|
696
689
|
# enabled: true,
|
|
@@ -708,14 +701,13 @@ module Kreuzberg
|
|
|
708
701
|
# language_detection: Config::LanguageDetection.new(enabled: true),
|
|
709
702
|
# pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
|
|
710
703
|
# image_extraction: Config::ImageExtraction.new(target_dpi: 600),
|
|
711
|
-
# image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
|
|
712
704
|
# postprocessor: Config::PostProcessor.new(enabled: true)
|
|
713
705
|
# )
|
|
714
706
|
#
|
|
715
707
|
class Extraction
|
|
716
708
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
717
709
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
|
-
:images, :image_preprocessing, :postprocessor,
|
|
710
|
+
:images, :postprocessor,
|
|
719
711
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
712
|
:max_concurrent_extractions, :output_format, :result_format
|
|
721
713
|
|
|
@@ -739,7 +731,7 @@ module Kreuzberg
|
|
|
739
731
|
# Keys that are allowed in the Extraction config
|
|
740
732
|
ALLOWED_KEYS = %i[
|
|
741
733
|
use_cache enable_quality_processing force_ocr ocr chunking
|
|
742
|
-
language_detection pdf_options image_extraction image_preprocessing
|
|
734
|
+
language_detection pdf_options image_extraction
|
|
743
735
|
postprocessor token_reduction keywords html_options pages
|
|
744
736
|
max_concurrent_extractions output_format result_format
|
|
745
737
|
].freeze
|
|
@@ -800,14 +792,13 @@ module Kreuzberg
|
|
|
800
792
|
|
|
801
793
|
def initialize(hash = nil,
|
|
802
794
|
use_cache: true,
|
|
803
|
-
enable_quality_processing: false,
|
|
795
|
+
enable_quality_processing: true,
|
|
804
796
|
force_ocr: false,
|
|
805
797
|
ocr: nil,
|
|
806
798
|
chunking: nil,
|
|
807
799
|
language_detection: nil,
|
|
808
800
|
pdf_options: nil,
|
|
809
801
|
image_extraction: nil,
|
|
810
|
-
image_preprocessing: nil,
|
|
811
802
|
postprocessor: nil,
|
|
812
803
|
token_reduction: nil,
|
|
813
804
|
keywords: nil,
|
|
@@ -820,7 +811,7 @@ module Kreuzberg
|
|
|
820
811
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
821
812
|
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
822
813
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
823
|
-
image_preprocessing: image_preprocessing, postprocessor: postprocessor,
|
814
|
+
postprocessor: postprocessor,
|
|
824
815
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
825
816
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
826
817
|
output_format: output_format, result_format: result_format
|
|
@@ -846,7 +837,6 @@ module Kreuzberg
|
|
|
846
837
|
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
847
838
|
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
848
839
|
@images = normalize_config(params[:image_extraction], ImageExtraction)
|
|
849
|
-
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
850
840
|
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
851
841
|
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
852
842
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
@@ -878,7 +868,6 @@ module Kreuzberg
|
|
|
878
868
|
end
|
|
879
869
|
|
|
880
870
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
881
|
-
# rubocop:disable Metrics/MethodLength
|
|
882
871
|
def to_h
|
|
883
872
|
{
|
|
884
873
|
use_cache: @use_cache,
|
|
@@ -889,7 +878,6 @@ module Kreuzberg
|
|
|
889
878
|
language_detection: @language_detection&.to_h,
|
|
890
879
|
pdf_options: @pdf_options&.to_h,
|
|
891
880
|
images: @images&.to_h,
|
|
892
|
-
image_preprocessing: @image_preprocessing&.to_h,
|
|
893
881
|
postprocessor: @postprocessor&.to_h,
|
|
894
882
|
token_reduction: @token_reduction&.to_h,
|
|
895
883
|
keywords: @keywords&.to_h,
|
|
@@ -900,7 +888,6 @@ module Kreuzberg
|
|
|
900
888
|
result_format: @result_format
|
|
901
889
|
}.compact
|
|
902
890
|
end
|
|
903
|
-
# rubocop:enable Metrics/MethodLength
|
|
904
891
|
# rubocop:enable Metrics/CyclomaticComplexity
|
|
905
892
|
|
|
906
893
|
# Serialize configuration to JSON string
|
|
@@ -1025,8 +1012,6 @@ module Kreuzberg
|
|
|
1025
1012
|
@pdf_options = normalize_config(value, PDF)
|
|
1026
1013
|
when :image_extraction
|
|
1027
1014
|
@images = normalize_config(value, ImageExtraction)
|
|
1028
|
-
when :image_preprocessing
|
|
1029
|
-
@image_preprocessing = normalize_config(value, ImagePreprocessing)
|
|
1030
1015
|
when :postprocessor
|
|
1031
1016
|
@postprocessor = normalize_config(value, PostProcessor)
|
|
1032
1017
|
when :token_reduction
|
|
@@ -1101,7 +1086,6 @@ module Kreuzberg
|
|
|
1101
1086
|
@language_detection = merged.language_detection
|
|
1102
1087
|
@pdf_options = merged.pdf_options
|
|
1103
1088
|
@images = merged.image_extraction
|
|
1104
|
-
@image_preprocessing = merged.image_preprocessing
|
|
1105
1089
|
@postprocessor = merged.postprocessor
|
|
1106
1090
|
@token_reduction = merged.token_reduction
|
|
1107
1091
|
@keywords = merged.keywords
|
data/lib/kreuzberg/version.rb
CHANGED
data/spec/binding/config_spec.rb
CHANGED
|
@@ -309,7 +309,7 @@ RSpec.describe Kreuzberg::Config do
|
|
|
309
309
|
config = described_class.new
|
|
310
310
|
|
|
311
311
|
expect(config.use_cache).to be true
|
|
312
|
-
expect(config.enable_quality_processing).to be false
|
|
312
|
+
expect(config.enable_quality_processing).to be true
|
|
313
313
|
expect(config.force_ocr).to be false
|
|
314
314
|
expect(config.ocr).to be_nil
|
|
315
315
|
expect(config.chunking).to be_nil
|
|
@@ -6,7 +6,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
6
6
|
config = described_class.new
|
|
7
7
|
|
|
8
8
|
expect(config.use_cache).to be true
|
|
9
|
-
expect(config.enable_quality_processing).to be false
|
|
9
|
+
expect(config.enable_quality_processing).to be true
|
|
10
10
|
expect(config.force_ocr).to be false
|
|
11
11
|
expect(config.ocr).to be_nil
|
|
12
12
|
expect(config.chunking).to be_nil
|
|
@@ -103,7 +103,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
|
|
|
103
103
|
hash = config.to_h
|
|
104
104
|
|
|
105
105
|
expect(hash[:use_cache]).to be true
|
|
106
|
-
expect(hash[:enable_quality_processing]).to be false
|
|
106
|
+
expect(hash[:enable_quality_processing]).to be true
|
|
107
107
|
expect(hash[:force_ocr]).to be false
|
|
108
108
|
end
|
|
109
109
|
end
|
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.1"
|
|
3
|
+
version = "4.2.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -71,7 +71,7 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
|
|
|
71
71
|
keywords-rake = ["dep:rake", "stopwords"]
|
|
72
72
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
73
73
|
|
|
74
|
-
api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
|
|
74
|
+
api = ["dep:axum", "dep:tower", "dep:tower-http", "dep:utoipa", "tokio-runtime"]
|
|
75
75
|
mcp = ["dep:rmcp", "tokio-runtime"]
|
|
76
76
|
mcp-http = ["mcp", "api"]
|
|
77
77
|
|
|
@@ -198,6 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
+
utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
|
|
201
202
|
rmcp = { version = "0.14.0", features = [
|
|
202
203
|
"server",
|
|
203
204
|
"macros",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.1 Release**
|
|
20
|
+
> **🚀 Version 4.2.2 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,14 +2,38 @@
|
|
|
2
2
|
|
|
3
3
|
use axum::{
|
|
4
4
|
Json,
|
|
5
|
+
extract::{FromRequest, Request, rejection::JsonRejection},
|
|
5
6
|
http::StatusCode,
|
|
6
7
|
response::{IntoResponse, Response},
|
|
7
8
|
};
|
|
9
|
+
use serde::de::DeserializeOwned;
|
|
8
10
|
|
|
9
11
|
use crate::error::KreuzbergError;
|
|
10
12
|
|
|
11
13
|
use super::types::ErrorResponse;
|
|
12
14
|
|
|
15
|
+
/// Custom JSON extractor that returns JSON error responses instead of plain text.
|
|
16
|
+
///
|
|
17
|
+
/// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
|
|
18
|
+
/// ensuring that all JSON parsing errors are returned as JSON with proper content type.
|
|
19
|
+
#[derive(Debug, Clone, Copy, Default)]
|
|
20
|
+
pub struct JsonApi<T>(pub T);
|
|
21
|
+
|
|
22
|
+
impl<T, S> FromRequest<S> for JsonApi<T>
|
|
23
|
+
where
|
|
24
|
+
T: DeserializeOwned,
|
|
25
|
+
S: Send + Sync,
|
|
26
|
+
{
|
|
27
|
+
type Rejection = ApiError;
|
|
28
|
+
|
|
29
|
+
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
|
30
|
+
match Json::<T>::from_request(req, state).await {
|
|
31
|
+
Ok(Json(value)) => Ok(JsonApi(value)),
|
|
32
|
+
Err(rejection) => Err(ApiError::from(rejection)),
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
13
37
|
/// API-specific error wrapper.
|
|
14
38
|
#[derive(Debug)]
|
|
15
39
|
pub struct ApiError {
|
|
@@ -79,3 +103,39 @@ impl From<KreuzbergError> for ApiError {
|
|
|
79
103
|
}
|
|
80
104
|
}
|
|
81
105
|
}
|
|
106
|
+
|
|
107
|
+
impl From<JsonRejection> for ApiError {
|
|
108
|
+
fn from(rejection: JsonRejection) -> Self {
|
|
109
|
+
let (status, message) = match rejection {
|
|
110
|
+
JsonRejection::JsonDataError(err) => (
|
|
111
|
+
StatusCode::UNPROCESSABLE_ENTITY,
|
|
112
|
+
format!(
|
|
113
|
+
"Failed to deserialize the JSON body into the target type: {}",
|
|
114
|
+
err.body_text()
|
|
115
|
+
),
|
|
116
|
+
),
|
|
117
|
+
JsonRejection::JsonSyntaxError(err) => (
|
|
118
|
+
StatusCode::BAD_REQUEST,
|
|
119
|
+
format!("Failed to parse the request body as JSON: {}", err.body_text()),
|
|
120
|
+
),
|
|
121
|
+
JsonRejection::MissingJsonContentType(_) => (
|
|
122
|
+
StatusCode::UNSUPPORTED_MEDIA_TYPE,
|
|
123
|
+
"Expected request with `Content-Type: application/json`".to_string(),
|
|
124
|
+
),
|
|
125
|
+
JsonRejection::BytesRejection(err) => {
|
|
126
|
+
(StatusCode::BAD_REQUEST, format!("Failed to read request body: {}", err))
|
|
127
|
+
}
|
|
128
|
+
_ => (StatusCode::BAD_REQUEST, "Unknown JSON parsing error".to_string()),
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
Self {
|
|
132
|
+
status,
|
|
133
|
+
body: ErrorResponse {
|
|
134
|
+
error_type: "JsonParsingError".to_string(),
|
|
135
|
+
message,
|
|
136
|
+
traceback: None,
|
|
137
|
+
status_code: status.as_u16(),
|
|
138
|
+
},
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
@@ -8,13 +8,60 @@ use axum::{
|
|
|
8
8
|
use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
9
9
|
|
|
10
10
|
use super::{
|
|
11
|
-
error::ApiError,
|
|
11
|
+
error::{ApiError, JsonApi},
|
|
12
12
|
types::{
|
|
13
13
|
ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
|
|
14
14
|
ExtractResponse, HealthResponse, InfoResponse,
|
|
15
15
|
},
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
+
/// Health check endpoint handler.
|
|
19
|
+
///
|
|
20
|
+
/// GET /health
|
|
21
|
+
#[utoipa::path(
|
|
22
|
+
get,
|
|
23
|
+
path = "/health",
|
|
24
|
+
tag = "health",
|
|
25
|
+
responses(
|
|
26
|
+
(status = 200, description = "Service is healthy", body = HealthResponse),
|
|
27
|
+
)
|
|
28
|
+
)]
|
|
29
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
30
|
+
pub async fn health_handler() -> Json<HealthResponse> {
|
|
31
|
+
// Get plugin status
|
|
32
|
+
let plugin_status = crate::plugins::startup_validation::PluginHealthStatus::check();
|
|
33
|
+
|
|
34
|
+
Json(HealthResponse {
|
|
35
|
+
status: "healthy".to_string(),
|
|
36
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
37
|
+
plugins: Some(super::types::PluginStatus {
|
|
38
|
+
ocr_backends_count: plugin_status.ocr_backends_count,
|
|
39
|
+
ocr_backends: plugin_status.ocr_backends,
|
|
40
|
+
extractors_count: plugin_status.extractors_count,
|
|
41
|
+
post_processors_count: plugin_status.post_processors_count,
|
|
42
|
+
}),
|
|
43
|
+
})
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/// Server info endpoint handler.
|
|
47
|
+
///
|
|
48
|
+
/// GET /info
|
|
49
|
+
#[utoipa::path(
|
|
50
|
+
get,
|
|
51
|
+
path = "/info",
|
|
52
|
+
tag = "health",
|
|
53
|
+
responses(
|
|
54
|
+
(status = 200, description = "Server information", body = InfoResponse),
|
|
55
|
+
)
|
|
56
|
+
)]
|
|
57
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
58
|
+
pub async fn info_handler() -> Json<InfoResponse> {
|
|
59
|
+
Json(InfoResponse {
|
|
60
|
+
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
61
|
+
rust_backend: true,
|
|
62
|
+
})
|
|
63
|
+
}
|
|
64
|
+
|
|
18
65
|
/// Extract endpoint handler.
|
|
19
66
|
///
|
|
20
67
|
/// POST /extract
|
|
@@ -37,6 +84,19 @@ use super::{
|
|
|
37
84
|
///
|
|
38
85
|
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
39
86
|
/// is used as the base, and any per-request config overrides those defaults.
|
|
87
|
+
// TODO: Add utoipa::path annotation once ExtractionResult implements ToSchema
|
|
88
|
+
// #[utoipa::path(
|
|
89
|
+
// post,
|
|
90
|
+
// path = "/extract",
|
|
91
|
+
// tag = "extraction",
|
|
92
|
+
// request_body(content_type = "multipart/form-data"),
|
|
93
|
+
// responses(
|
|
94
|
+
// (status = 200, description = "Extraction successful", body = ExtractResponse),
|
|
95
|
+
// (status = 400, description = "Bad request", body = crate::api::types::ErrorResponse),
|
|
96
|
+
// (status = 413, description = "Payload too large", body = crate::api::types::ErrorResponse),
|
|
97
|
+
// (status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
98
|
+
// )
|
|
99
|
+
// )]
|
|
40
100
|
#[cfg_attr(
|
|
41
101
|
feature = "otel",
|
|
42
102
|
tracing::instrument(
|
|
@@ -132,28 +192,6 @@ pub async fn extract_handler(
|
|
|
132
192
|
Ok(Json(results))
|
|
133
193
|
}
|
|
134
194
|
|
|
135
|
-
/// Health check endpoint handler.
|
|
136
|
-
///
|
|
137
|
-
/// GET /health
|
|
138
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
139
|
-
pub async fn health_handler() -> Json<HealthResponse> {
|
|
140
|
-
Json(HealthResponse {
|
|
141
|
-
status: "healthy".to_string(),
|
|
142
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
143
|
-
})
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/// Server info endpoint handler.
|
|
147
|
-
///
|
|
148
|
-
/// GET /info
|
|
149
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
150
|
-
pub async fn info_handler() -> Json<InfoResponse> {
|
|
151
|
-
Json(InfoResponse {
|
|
152
|
-
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
153
|
-
rust_backend: true,
|
|
154
|
-
})
|
|
155
|
-
}
|
|
156
|
-
|
|
157
195
|
/// Cache stats endpoint handler.
|
|
158
196
|
///
|
|
159
197
|
/// GET /cache/stats
|
|
@@ -164,6 +202,15 @@ pub async fn info_handler() -> Json<InfoResponse> {
|
|
|
164
202
|
/// - Current directory cannot be determined
|
|
165
203
|
/// - Cache directory path contains non-UTF8 characters
|
|
166
204
|
/// - Cache metadata retrieval fails
|
|
205
|
+
#[utoipa::path(
|
|
206
|
+
get,
|
|
207
|
+
path = "/cache/stats",
|
|
208
|
+
tag = "cache",
|
|
209
|
+
responses(
|
|
210
|
+
(status = 200, description = "Cache statistics", body = CacheStatsResponse),
|
|
211
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
212
|
+
)
|
|
213
|
+
)]
|
|
167
214
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
|
|
168
215
|
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
169
216
|
let cache_dir = std::env::current_dir()
|
|
@@ -204,6 +251,15 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
|
|
|
204
251
|
/// - Current directory cannot be determined
|
|
205
252
|
/// - Cache directory path contains non-UTF8 characters
|
|
206
253
|
/// - Cache clearing operation fails
|
|
254
|
+
#[utoipa::path(
|
|
255
|
+
delete,
|
|
256
|
+
path = "/cache/clear",
|
|
257
|
+
tag = "cache",
|
|
258
|
+
responses(
|
|
259
|
+
(status = 200, description = "Cache cleared", body = CacheClearResponse),
|
|
260
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
261
|
+
)
|
|
262
|
+
)]
|
|
207
263
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
|
|
208
264
|
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
209
265
|
let cache_dir = std::env::current_dir()
|
|
@@ -248,6 +304,18 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
248
304
|
/// - ONNX Runtime is not available
|
|
249
305
|
/// - Model initialization fails
|
|
250
306
|
/// - Embedding generation fails
|
|
307
|
+
#[utoipa::path(
|
|
308
|
+
post,
|
|
309
|
+
path = "/embed",
|
|
310
|
+
tag = "embeddings",
|
|
311
|
+
request_body = EmbedRequest,
|
|
312
|
+
responses(
|
|
313
|
+
(status = 200, description = "Embeddings generated", body = EmbedResponse),
|
|
314
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
|
|
315
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
316
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
317
|
+
)
|
|
318
|
+
)]
|
|
251
319
|
#[cfg(feature = "embeddings")]
|
|
252
320
|
#[cfg_attr(
|
|
253
321
|
feature = "otel",
|
|
@@ -260,7 +328,7 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
260
328
|
)
|
|
261
329
|
)
|
|
262
330
|
)]
|
|
263
|
-
pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
331
|
+
pub async fn embed_handler(JsonApi(request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
264
332
|
use crate::types::{Chunk, ChunkMetadata};
|
|
265
333
|
|
|
266
334
|
if request.texts.is_empty() {
|
|
@@ -269,6 +337,13 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
|
|
|
269
337
|
)));
|
|
270
338
|
}
|
|
271
339
|
|
|
340
|
+
// Validate that no texts are empty
|
|
341
|
+
if request.texts.iter().any(|t| t.is_empty()) {
|
|
342
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
343
|
+
"All text entries must be non-empty strings",
|
|
344
|
+
)));
|
|
345
|
+
}
|
|
346
|
+
|
|
272
347
|
// Use default config if none provided
|
|
273
348
|
let config = request.config.unwrap_or_default();
|
|
274
349
|
|
|
@@ -331,8 +406,20 @@ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<Emb
|
|
|
331
406
|
/// Embedding endpoint handler (when embeddings feature is disabled).
|
|
332
407
|
///
|
|
333
408
|
/// Returns an error indicating embeddings feature is not enabled.
|
|
409
|
+
#[utoipa::path(
|
|
410
|
+
post,
|
|
411
|
+
path = "/embed",
|
|
412
|
+
tag = "embeddings",
|
|
413
|
+
request_body = EmbedRequest,
|
|
414
|
+
responses(
|
|
415
|
+
(status = 200, description = "Embeddings generated", body = EmbedResponse),
|
|
416
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty texts array)", body = crate::api::types::ErrorResponse),
|
|
417
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
418
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
419
|
+
)
|
|
420
|
+
)]
|
|
334
421
|
#[cfg(not(feature = "embeddings"))]
|
|
335
|
-
pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
422
|
+
pub async fn embed_handler(JsonApi(_request): JsonApi<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
336
423
|
Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
|
|
337
424
|
"Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
|
|
338
425
|
)))
|
|
@@ -344,6 +431,18 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
|
|
|
344
431
|
///
|
|
345
432
|
/// Accepts JSON body with text and optional configuration.
|
|
346
433
|
/// Returns chunks with metadata.
|
|
434
|
+
#[utoipa::path(
|
|
435
|
+
post,
|
|
436
|
+
path = "/chunk",
|
|
437
|
+
tag = "chunking",
|
|
438
|
+
request_body = ChunkRequest,
|
|
439
|
+
responses(
|
|
440
|
+
(status = 200, description = "Text chunked successfully", body = ChunkResponse),
|
|
441
|
+
(status = 400, description = "Bad request - validation failed (e.g., empty text)", body = crate::api::types::ErrorResponse),
|
|
442
|
+
(status = 422, description = "Unprocessable entity - invalid JSON body", body = crate::api::types::ErrorResponse),
|
|
443
|
+
(status = 500, description = "Internal server error", body = crate::api::types::ErrorResponse),
|
|
444
|
+
)
|
|
445
|
+
)]
|
|
347
446
|
#[cfg_attr(
|
|
348
447
|
feature = "otel",
|
|
349
448
|
tracing::instrument(
|
|
@@ -352,7 +451,7 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
|
|
|
352
451
|
fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
|
|
353
452
|
)
|
|
354
453
|
)]
|
|
355
|
-
pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
|
|
454
|
+
pub async fn chunk_handler(JsonApi(request): JsonApi<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
|
|
356
455
|
use super::types::{ChunkItem, ChunkingConfigResponse};
|
|
357
456
|
use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
|
|
358
457
|
|
|
@@ -363,9 +462,9 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
|
|
|
363
462
|
)));
|
|
364
463
|
}
|
|
365
464
|
|
|
366
|
-
// Parse chunker_type
|
|
465
|
+
// Parse chunker_type (empty string is invalid, use default by omitting the field)
|
|
367
466
|
let chunker_type = match request.chunker_type.to_lowercase().as_str() {
|
|
368
|
-
"text"
|
|
467
|
+
"text" => ChunkerType::Text,
|
|
369
468
|
"markdown" => ChunkerType::Markdown,
|
|
370
469
|
other => {
|
|
371
470
|
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
@@ -377,15 +476,37 @@ pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<Chu
|
|
|
377
476
|
|
|
378
477
|
// Build config with defaults
|
|
379
478
|
let cfg = request.config.unwrap_or_default();
|
|
479
|
+
let max_characters = cfg.max_characters.unwrap_or(2000);
|
|
480
|
+
let overlap = cfg.overlap.unwrap_or(100);
|
|
481
|
+
|
|
482
|
+
// Validate chunking configuration
|
|
483
|
+
if overlap >= max_characters {
|
|
484
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
485
|
+
"Invalid chunking configuration: overlap ({}) must be less than max_characters ({})",
|
|
486
|
+
overlap, max_characters
|
|
487
|
+
))));
|
|
488
|
+
}
|
|
489
|
+
|
|
380
490
|
let config = ChunkingConfig {
|
|
381
|
-
max_characters: cfg.max_characters.unwrap_or(2000),
|
|
382
|
-
overlap: cfg.overlap.unwrap_or(100),
|
|
491
|
+
max_characters,
|
|
492
|
+
overlap,
|
|
383
493
|
trim: cfg.trim.unwrap_or(true),
|
|
384
494
|
chunker_type,
|
|
385
495
|
};
|
|
386
496
|
|
|
387
|
-
// Perform chunking
|
|
388
|
-
let result = chunk_text(&request.text, &config, None).map_err(ApiError::internal)?;
|
|
497
|
+
// Perform chunking - convert any remaining errors to validation errors since they're likely config issues
|
|
498
|
+
let result = chunk_text(&request.text, &config, None).map_err(|e| {
|
|
499
|
+
// Check if error message indicates a configuration issue
|
|
500
|
+
let msg = e.to_string();
|
|
501
|
+
if msg.contains("configuration") || msg.contains("overlap") || msg.contains("capacity") {
|
|
502
|
+
ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
503
|
+
"Invalid chunking configuration: {}",
|
|
504
|
+
msg
|
|
505
|
+
)))
|
|
506
|
+
} else {
|
|
507
|
+
ApiError::internal(e)
|
|
508
|
+
}
|
|
509
|
+
})?;
|
|
389
510
|
|
|
390
511
|
// Transform to response
|
|
391
512
|
let chunks = result
|