kreuzberg 4.0.0.pre.rc.27 → 4.0.0.pre.rc.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -6
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +7 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +33 -13
- data/vendor/kreuzberg/src/api/handlers.rs +126 -1
- data/vendor/kreuzberg/src/api/mod.rs +10 -3
- data/vendor/kreuzberg/src/api/server.rs +205 -197
- data/vendor/kreuzberg/src/api/types.rs +23 -0
- data/vendor/kreuzberg/src/core/config.rs +561 -0
- data/vendor/kreuzberg/src/core/config_validation.rs +295 -0
- data/vendor/kreuzberg/src/core/mod.rs +2 -0
- data/vendor/kreuzberg/src/core/server_config.rs +1044 -0
- data/vendor/kreuzberg/src/embeddings.rs +21 -49
- data/vendor/kreuzberg/src/extractors/docx.rs +1 -1
- data/vendor/kreuzberg/src/extractors/pdf.rs +29 -30
- data/vendor/kreuzberg/src/extractors/pptx.rs +30 -18
- data/vendor/kreuzberg/src/lib.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -40
- data/vendor/kreuzberg/src/ocr/processor.rs +1 -6
- data/vendor/kreuzberg/src/pdf/metadata.rs +7 -0
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +1 -2
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +14 -3
- data/vendor/kreuzberg-tesseract/build.rs +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3516f4c7752e59f020d36750e06b493b7bf1670d1e5ca56423324cf90c48e5f8
|
|
4
|
+
data.tar.gz: 6fe9f382e6741eaa69c28ffc70647f6b0cccebaa7072b23f8811d0072843fc09
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 53b28f45c8831830c269580c73f0480afddf3550ff0fa248a6155b0478f022696e7025f20699fc36be358bd04e3a0e5a279259c3960ee6ed9d0218464897d928
|
|
7
|
+
data.tar.gz: 32b50779cf5b0ed5aaab7804f522297f9b12fe3b9374dc8efadeec2ec78aada25d1372dc50ffe81baccddfcca271b1345d09cb41e239498c7d061d0a31982979
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.27)
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.28)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -76,7 +76,7 @@ GEM
|
|
|
76
76
|
ffi (~> 1.0)
|
|
77
77
|
rb_sys (0.9.119)
|
|
78
78
|
rake-compiler-dock (= 1.10.0)
|
|
79
|
-
rbs (3.10.
|
|
79
|
+
rbs (3.10.1)
|
|
80
80
|
logger
|
|
81
81
|
regexp_parser (2.11.3)
|
|
82
82
|
rspec (3.13.2)
|
|
@@ -132,7 +132,7 @@ GEM
|
|
|
132
132
|
strscan (>= 1.0.0)
|
|
133
133
|
terminal-table (>= 2, < 5)
|
|
134
134
|
uri (>= 0.12.0)
|
|
135
|
-
strscan (3.1.
|
|
135
|
+
strscan (3.1.7)
|
|
136
136
|
terminal-table (4.0.0)
|
|
137
137
|
unicode-display_width (>= 1.1.1, < 4)
|
|
138
138
|
tzinfo (2.0.6)
|
|
@@ -198,7 +198,7 @@ CHECKSUMS
|
|
|
198
198
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
199
199
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
200
200
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
201
|
-
kreuzberg (4.0.0.pre.rc.27)
|
|
201
|
+
kreuzberg (4.0.0.pre.rc.28)
|
|
202
202
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
203
203
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
204
204
|
listen (3.9.0) sha256=db9e4424e0e5834480385197c139cb6b0ae0ef28cc13310cfd1ca78377d59c67
|
|
@@ -219,7 +219,7 @@ CHECKSUMS
|
|
|
219
219
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
220
220
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
221
221
|
rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
|
|
222
|
-
rbs (3.10.
|
|
222
|
+
rbs (3.10.1) sha256=4e0a9e460dd2b0b763be24734b113da32fc621d383c1119005fe7fb18c73d0c9
|
|
223
223
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
224
224
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
225
225
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
@@ -233,7 +233,7 @@ CHECKSUMS
|
|
|
233
233
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
234
234
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
235
235
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
236
|
-
strscan (3.1.
|
|
236
|
+
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
237
237
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
|
238
238
|
tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
|
|
239
239
|
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
[workspace]
|
|
2
2
|
|
|
3
|
+
[workspace.lints.clippy]
|
|
4
|
+
collapsible_if = "allow"
|
|
5
|
+
|
|
3
6
|
[package]
|
|
4
7
|
name = "kreuzberg-rb"
|
|
5
|
-
version = "4.0.0-rc.27"
|
|
8
|
+
version = "4.0.0-rc.28"
|
|
6
9
|
edition = "2024"
|
|
7
10
|
rust-version = "1.91"
|
|
8
11
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -15,6 +18,9 @@ description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document
|
|
|
15
18
|
keywords = ["ruby", "magnus", "document", "extraction", "bindings"]
|
|
16
19
|
categories = ["api-bindings", "text-processing"]
|
|
17
20
|
|
|
21
|
+
[lints]
|
|
22
|
+
workspace = true
|
|
23
|
+
|
|
18
24
|
[lib]
|
|
19
25
|
name = "kreuzberg_rb"
|
|
20
26
|
crate-type = ["cdylib", "rlib"]
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.0.0-rc.27"
|
|
6
|
+
version = "4.0.0-rc.28"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -26,7 +26,7 @@ tokio = { version = "1.49.0", features = [
|
|
|
26
26
|
|
|
27
27
|
# Serialization
|
|
28
28
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
29
|
-
serde_json = "1.0.
|
|
29
|
+
serde_json = "1.0.149"
|
|
30
30
|
|
|
31
31
|
# Error handling
|
|
32
32
|
thiserror = "2.0.17"
|
|
@@ -47,10 +47,10 @@ hex = "0.4.3"
|
|
|
47
47
|
toml = "0.9.10"
|
|
48
48
|
num_cpus = "1.17.0"
|
|
49
49
|
once_cell = "1.21.3"
|
|
50
|
-
html-to-markdown-rs = { version = "2.
|
|
51
|
-
reqwest = { version = "0.
|
|
50
|
+
html-to-markdown-rs = { version = "2.20.0", default-features = false }
|
|
51
|
+
reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
|
|
52
52
|
image = { version = "0.25.9", default-features = false }
|
|
53
|
-
lzma-rust2 = { version = "0.15.
|
|
53
|
+
lzma-rust2 = { version = "0.15.6" }
|
|
54
54
|
|
|
55
55
|
# Testing (dev)
|
|
56
56
|
tempfile = "3.24.0"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.0.0-rc.27"
|
|
3
|
+
version = "4.0.0-rc.28"
|
|
4
4
|
edition = "2024"
|
|
5
|
-
lints.workspace = true
|
|
6
5
|
rust-version = "1.91"
|
|
7
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
8
7
|
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
|
|
@@ -145,10 +144,6 @@ tokio = { workspace = true, optional = true }
|
|
|
145
144
|
uuid = { version = "1.19.0", features = ["v4", "js"] }
|
|
146
145
|
indexmap = "2.12.1"
|
|
147
146
|
tracing = { workspace = true }
|
|
148
|
-
reqwest = { workspace = true, default-features = false, features = [
|
|
149
|
-
"json",
|
|
150
|
-
"rustls-tls",
|
|
151
|
-
], optional = true }
|
|
152
147
|
pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
|
|
153
148
|
"thread_safe",
|
|
154
149
|
"image_latest",
|
|
@@ -188,16 +183,12 @@ image = { workspace = true, default-features = false, features = [
|
|
|
188
183
|
"gif",
|
|
189
184
|
"rayon",
|
|
190
185
|
], optional = true }
|
|
191
|
-
tiff = { version = "0.
|
|
186
|
+
tiff = { version = "0.11", optional = true }
|
|
192
187
|
fast_image_resize = { version = "5.5.0", optional = true }
|
|
193
188
|
ndarray = { version = "0.17.1", optional = true }
|
|
194
189
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
195
190
|
whatlang = { version = "0.18.0", optional = true }
|
|
196
191
|
text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
|
|
197
|
-
fastembed = { version = "5.6", default-features = false, features = [
|
|
198
|
-
"hf-hub-rustls-tls",
|
|
199
|
-
"ort-load-dynamic",
|
|
200
|
-
], optional = true }
|
|
201
192
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
202
193
|
chardetng = { version = "0.1.17", optional = true }
|
|
203
194
|
encoding_rs = { version = "0.8.35", optional = true }
|
|
@@ -225,15 +216,44 @@ tempfile = { workspace = true }
|
|
|
225
216
|
filetime = "0.2"
|
|
226
217
|
tar = "0.4.44"
|
|
227
218
|
zip = "7.0.0"
|
|
228
|
-
serial_test = "3.
|
|
219
|
+
serial_test = "3.3.1"
|
|
229
220
|
anyhow = { workspace = true }
|
|
230
221
|
tokio-test = "0.4"
|
|
231
222
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
232
223
|
criterion = { workspace = true }
|
|
233
224
|
image = { workspace = true, default-features = false, features = ["png"] }
|
|
234
225
|
|
|
235
|
-
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
|
226
|
+
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
|
|
236
227
|
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
|
228
|
+
# Use rustls on non-Windows platforms (Linux, macOS)
|
|
229
|
+
reqwest = { workspace = true, default-features = false, features = [
|
|
230
|
+
"json",
|
|
231
|
+
"rustls",
|
|
232
|
+
], optional = true }
|
|
233
|
+
# Use rustls-tls for fastembed on non-Windows platforms
|
|
234
|
+
fastembed = { version = "5.6", default-features = false, features = [
|
|
235
|
+
"hf-hub-rustls-tls",
|
|
236
|
+
"ort-load-dynamic",
|
|
237
|
+
], optional = true }
|
|
238
|
+
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
|
|
239
|
+
ureq = { version = "2.12", default-features = false, features = ["tls", "json"] }
|
|
240
|
+
|
|
241
|
+
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
242
|
+
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
|
|
243
|
+
reqwest = { workspace = true, default-features = false, features = [
|
|
244
|
+
"json",
|
|
245
|
+
"native-tls",
|
|
246
|
+
], optional = true }
|
|
247
|
+
# Use native-tls for fastembed on Windows
|
|
248
|
+
fastembed = { version = "5.6", default-features = false, features = [
|
|
249
|
+
"hf-hub-native-tls",
|
|
250
|
+
"ort-load-dynamic",
|
|
251
|
+
], optional = true }
|
|
252
|
+
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
|
|
253
|
+
ureq = { version = "2.12", default-features = false, features = ["native-tls", "json"] }
|
|
237
254
|
|
|
238
255
|
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
|
239
256
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
257
|
+
# Override getrandom to enable js feature for WASM targets
|
|
258
|
+
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
259
|
+
getrandom = { workspace = true }
|
|
@@ -9,7 +9,10 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
|
9
9
|
|
|
10
10
|
use super::{
|
|
11
11
|
error::ApiError,
|
|
12
|
-
types::{
|
|
12
|
+
types::{
|
|
13
|
+
ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ExtractResponse, HealthResponse,
|
|
14
|
+
InfoResponse,
|
|
15
|
+
},
|
|
13
16
|
};
|
|
14
17
|
|
|
15
18
|
/// Extract endpoint handler.
|
|
@@ -34,6 +37,14 @@ use super::{
|
|
|
34
37
|
///
|
|
35
38
|
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
36
39
|
/// is used as the base, and any per-request config overrides those defaults.
|
|
40
|
+
#[cfg_attr(
|
|
41
|
+
feature = "otel",
|
|
42
|
+
tracing::instrument(
|
|
43
|
+
name = "api.extract",
|
|
44
|
+
skip(state, multipart),
|
|
45
|
+
fields(files_count = tracing::field::Empty)
|
|
46
|
+
)
|
|
47
|
+
)]
|
|
37
48
|
pub async fn extract_handler(
|
|
38
49
|
State(state): State<ApiState>,
|
|
39
50
|
mut multipart: Multipart,
|
|
@@ -84,6 +95,9 @@ pub async fn extract_handler(
|
|
|
84
95
|
)));
|
|
85
96
|
}
|
|
86
97
|
|
|
98
|
+
#[cfg(feature = "otel")]
|
|
99
|
+
tracing::Span::current().record("files_count", files.len());
|
|
100
|
+
|
|
87
101
|
if files.len() == 1 {
|
|
88
102
|
let (data, mime_type, _file_name) = files
|
|
89
103
|
.into_iter()
|
|
@@ -102,6 +116,7 @@ pub async fn extract_handler(
|
|
|
102
116
|
/// Health check endpoint handler.
|
|
103
117
|
///
|
|
104
118
|
/// GET /health
|
|
119
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
105
120
|
pub async fn health_handler() -> Json<HealthResponse> {
|
|
106
121
|
Json(HealthResponse {
|
|
107
122
|
status: "healthy".to_string(),
|
|
@@ -112,6 +127,7 @@ pub async fn health_handler() -> Json<HealthResponse> {
|
|
|
112
127
|
/// Server info endpoint handler.
|
|
113
128
|
///
|
|
114
129
|
/// GET /info
|
|
130
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
115
131
|
pub async fn info_handler() -> Json<InfoResponse> {
|
|
116
132
|
Json(InfoResponse {
|
|
117
133
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
@@ -129,6 +145,7 @@ pub async fn info_handler() -> Json<InfoResponse> {
|
|
|
129
145
|
/// - Current directory cannot be determined
|
|
130
146
|
/// - Cache directory path contains non-UTF8 characters
|
|
131
147
|
/// - Cache metadata retrieval fails
|
|
148
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
|
|
132
149
|
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
133
150
|
let cache_dir = std::env::current_dir()
|
|
134
151
|
.map_err(|e| {
|
|
@@ -168,6 +185,7 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
|
|
|
168
185
|
/// - Current directory cannot be determined
|
|
169
186
|
/// - Cache directory path contains non-UTF8 characters
|
|
170
187
|
/// - Cache clearing operation fails
|
|
188
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
|
|
171
189
|
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
172
190
|
let cache_dir = std::env::current_dir()
|
|
173
191
|
.map_err(|e| {
|
|
@@ -193,3 +211,110 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
193
211
|
freed_mb,
|
|
194
212
|
}))
|
|
195
213
|
}
|
|
214
|
+
|
|
215
|
+
/// Embedding endpoint handler.
|
|
216
|
+
///
|
|
217
|
+
/// POST /embed
|
|
218
|
+
///
|
|
219
|
+
/// Accepts JSON body with:
|
|
220
|
+
/// - `texts`: Array of strings to generate embeddings for
|
|
221
|
+
/// - `config` (optional): Embedding configuration (model, batch size, cache_dir)
|
|
222
|
+
///
|
|
223
|
+
/// Returns embeddings for each input text.
|
|
224
|
+
///
|
|
225
|
+
/// # Errors
|
|
226
|
+
///
|
|
227
|
+
/// Returns `ApiError::Internal` if:
|
|
228
|
+
/// - Embeddings feature is not enabled
|
|
229
|
+
/// - ONNX Runtime is not available
|
|
230
|
+
/// - Model initialization fails
|
|
231
|
+
/// - Embedding generation fails
|
|
232
|
+
#[cfg(feature = "embeddings")]
|
|
233
|
+
#[cfg_attr(
|
|
234
|
+
feature = "otel",
|
|
235
|
+
tracing::instrument(
|
|
236
|
+
name = "api.embed",
|
|
237
|
+
skip(request),
|
|
238
|
+
fields(
|
|
239
|
+
texts_count = request.texts.len(),
|
|
240
|
+
model = tracing::field::Empty
|
|
241
|
+
)
|
|
242
|
+
)
|
|
243
|
+
)]
|
|
244
|
+
pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
245
|
+
use crate::types::{Chunk, ChunkMetadata};
|
|
246
|
+
|
|
247
|
+
if request.texts.is_empty() {
|
|
248
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
249
|
+
"No texts provided for embedding generation",
|
|
250
|
+
)));
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Use default config if none provided
|
|
254
|
+
let config = request.config.unwrap_or_default();
|
|
255
|
+
|
|
256
|
+
// Create chunks from input texts
|
|
257
|
+
let mut chunks: Vec<Chunk> = request
|
|
258
|
+
.texts
|
|
259
|
+
.iter()
|
|
260
|
+
.enumerate()
|
|
261
|
+
.map(|(idx, text)| Chunk {
|
|
262
|
+
content: text.clone(),
|
|
263
|
+
embedding: None,
|
|
264
|
+
metadata: ChunkMetadata {
|
|
265
|
+
byte_start: 0,
|
|
266
|
+
byte_end: text.len(),
|
|
267
|
+
token_count: None,
|
|
268
|
+
chunk_index: idx,
|
|
269
|
+
total_chunks: request.texts.len(),
|
|
270
|
+
first_page: None,
|
|
271
|
+
last_page: None,
|
|
272
|
+
},
|
|
273
|
+
})
|
|
274
|
+
.collect();
|
|
275
|
+
|
|
276
|
+
// Generate embeddings
|
|
277
|
+
crate::embeddings::generate_embeddings_for_chunks(&mut chunks, &config).map_err(ApiError::internal)?;
|
|
278
|
+
|
|
279
|
+
// Extract embeddings from chunks
|
|
280
|
+
let embeddings: Vec<Vec<f32>> = chunks
|
|
281
|
+
.into_iter()
|
|
282
|
+
.map(|chunk| {
|
|
283
|
+
chunk.embedding.ok_or_else(|| {
|
|
284
|
+
ApiError::internal(crate::error::KreuzbergError::Other(
|
|
285
|
+
"Failed to generate embedding for text".to_string(),
|
|
286
|
+
))
|
|
287
|
+
})
|
|
288
|
+
})
|
|
289
|
+
.collect::<Result<Vec<_>, _>>()?;
|
|
290
|
+
|
|
291
|
+
let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
|
|
292
|
+
|
|
293
|
+
// Get model name from config
|
|
294
|
+
let model_name = match &config.model {
|
|
295
|
+
crate::core::config::EmbeddingModelType::Preset { name } => name.clone(),
|
|
296
|
+
#[cfg(feature = "embeddings")]
|
|
297
|
+
crate::core::config::EmbeddingModelType::FastEmbed { model, .. } => model.clone(),
|
|
298
|
+
crate::core::config::EmbeddingModelType::Custom { .. } => "custom".to_string(),
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
#[cfg(feature = "otel")]
|
|
302
|
+
tracing::Span::current().record("model", &model_name);
|
|
303
|
+
|
|
304
|
+
Ok(Json(EmbedResponse {
|
|
305
|
+
embeddings,
|
|
306
|
+
model: model_name,
|
|
307
|
+
dimensions,
|
|
308
|
+
count: request.texts.len(),
|
|
309
|
+
}))
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
/// Embedding endpoint handler (when embeddings feature is disabled).
|
|
313
|
+
///
|
|
314
|
+
/// Returns an error indicating embeddings feature is not enabled.
|
|
315
|
+
#[cfg(not(feature = "embeddings"))]
|
|
316
|
+
pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
317
|
+
Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
|
|
318
|
+
"Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
|
|
319
|
+
)))
|
|
320
|
+
}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
//! # Endpoints
|
|
7
7
|
//!
|
|
8
8
|
//! - `POST /extract` - Extract text from uploaded files (multipart form data)
|
|
9
|
+
//! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
|
|
9
10
|
//! - `GET /health` - Health check endpoint
|
|
10
11
|
//! - `GET /info` - Server information
|
|
11
12
|
//! - `GET /cache/stats` - Get cache statistics
|
|
@@ -70,6 +71,11 @@
|
|
|
70
71
|
//!
|
|
71
72
|
//! # Clear cache
|
|
72
73
|
//! curl -X DELETE http://localhost:8000/cache/clear
|
|
74
|
+
//!
|
|
75
|
+
//! # Generate embeddings
|
|
76
|
+
//! curl -X POST http://localhost:8000/embed \
|
|
77
|
+
//! -H "Content-Type: application/json" \
|
|
78
|
+
//! -d '{"texts":["Hello world","Second text"]}'
|
|
73
79
|
//! ```
|
|
74
80
|
|
|
75
81
|
mod error;
|
|
@@ -79,9 +85,10 @@ mod types;
|
|
|
79
85
|
|
|
80
86
|
pub use error::ApiError;
|
|
81
87
|
pub use server::{
|
|
82
|
-
create_router, create_router_with_limits,
|
|
88
|
+
create_router, create_router_with_limits, create_router_with_limits_and_server_config, load_server_config, serve,
|
|
89
|
+
serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
|
|
83
90
|
};
|
|
84
91
|
pub use types::{
|
|
85
|
-
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse,
|
|
86
|
-
InfoResponse,
|
|
92
|
+
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ErrorResponse,
|
|
93
|
+
ExtractResponse, HealthResponse, InfoResponse,
|
|
87
94
|
};
|