kreuzberg 4.0.0.pre.rc.27 → 4.0.0.pre.rc.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db85e33f6712e0a72c327af0e5c91e24f5c937fd97cb198ed566fa3e8779b00f
4
- data.tar.gz: 85f37cdec92c6f410a39d4d05f7537d013ff14a5d5d692911fd9e7d051e82139
3
+ metadata.gz: 3516f4c7752e59f020d36750e06b493b7bf1670d1e5ca56423324cf90c48e5f8
4
+ data.tar.gz: 6fe9f382e6741eaa69c28ffc70647f6b0cccebaa7072b23f8811d0072843fc09
5
5
  SHA512:
6
- metadata.gz: 4840194f9a38ac7df13ae3f847daea0d2b521ee5fc2f43ae7405ebdeeb972d8e0fd112b49ebedc7d0188b9aaf5ad63b9a00a16fd38a5ca5865d550a2ee1fd60f
7
- data.tar.gz: b63c74e8955c6d70cff05eb4f466b878eb64681cd59ce7077dc834f5e07fa0098dca64d9f51851d7fc7bb00c64de6cf748f897d577a3a3772fbc338803d6606e
6
+ metadata.gz: 53b28f45c8831830c269580c73f0480afddf3550ff0fa248a6155b0478f022696e7025f20699fc36be358bd04e3a0e5a279259c3960ee6ed9d0218464897d928
7
+ data.tar.gz: 32b50779cf5b0ed5aaab7804f522297f9b12fe3b9374dc8efadeec2ec78aada25d1372dc50ffe81baccddfcca271b1345d09cb41e239498c7d061d0a31982979
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.27)
4
+ kreuzberg (4.0.0.pre.rc.28)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -76,7 +76,7 @@ GEM
76
76
  ffi (~> 1.0)
77
77
  rb_sys (0.9.119)
78
78
  rake-compiler-dock (= 1.10.0)
79
- rbs (3.10.0)
79
+ rbs (3.10.1)
80
80
  logger
81
81
  regexp_parser (2.11.3)
82
82
  rspec (3.13.2)
@@ -132,7 +132,7 @@ GEM
132
132
  strscan (>= 1.0.0)
133
133
  terminal-table (>= 2, < 5)
134
134
  uri (>= 0.12.0)
135
- strscan (3.1.6)
135
+ strscan (3.1.7)
136
136
  terminal-table (4.0.0)
137
137
  unicode-display_width (>= 1.1.1, < 4)
138
138
  tzinfo (2.0.6)
@@ -198,7 +198,7 @@ CHECKSUMS
198
198
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
199
199
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
200
200
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
201
- kreuzberg (4.0.0.pre.rc.27)
201
+ kreuzberg (4.0.0.pre.rc.28)
202
202
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
203
203
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
204
204
  listen (3.9.0) sha256=db9e4424e0e5834480385197c139cb6b0ae0ef28cc13310cfd1ca78377d59c67
@@ -219,7 +219,7 @@ CHECKSUMS
219
219
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
220
220
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
221
221
  rb_sys (0.9.119) sha256=64393fa148e402e1b79b64496d2aabfc7df79da6b822b8bb48dc1141eaf40b4b
222
- rbs (3.10.0) sha256=e75b5f1313c71c9ee0fcea68bf97d3e5fe8ec7a641d4b5cd18bbc28c94ddf298
222
+ rbs (3.10.1) sha256=4e0a9e460dd2b0b763be24734b113da32fc621d383c1119005fe7fb18c73d0c9
223
223
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
224
224
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
225
225
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
@@ -233,7 +233,7 @@ CHECKSUMS
233
233
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
234
234
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
235
235
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
236
- strscan (3.1.6) sha256=ebd56df0b0468b00a1f2004b4078c34df58c3506b2bba939e7531892aece81f3
236
+ strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
237
237
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
238
238
  tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
239
239
  unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
@@ -2578,7 +2578,7 @@ dependencies = [
2578
2578
 
2579
2579
  [[package]]
2580
2580
  name = "kreuzberg-rb"
2581
- version = "4.0.0-rc.26"
2581
+ version = "4.0.0-rc.27"
2582
2582
  dependencies = [
2583
2583
  "async-trait",
2584
2584
  "html-to-markdown-rs",
@@ -1,8 +1,11 @@
1
1
  [workspace]
2
2
 
3
+ [workspace.lints.clippy]
4
+ collapsible_if = "allow"
5
+
3
6
  [package]
4
7
  name = "kreuzberg-rb"
5
- version = "4.0.0-rc.27"
8
+ version = "4.0.0-rc.28"
6
9
  edition = "2024"
7
10
  rust-version = "1.91"
8
11
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -15,6 +18,9 @@ description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document
15
18
  keywords = ["ruby", "magnus", "document", "extraction", "bindings"]
16
19
  categories = ["api-bindings", "text-processing"]
17
20
 
21
+ [lints]
22
+ workspace = true
23
+
18
24
  [lib]
19
25
  name = "kreuzberg_rb"
20
26
  crate-type = ["cdylib", "rlib"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.27'
4
+ VERSION = '4.0.0-rc.28'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.0.0-rc.27"
6
+ version = "4.0.0-rc.28"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ tokio = { version = "1.49.0", features = [
26
26
 
27
27
  # Serialization
28
28
  serde = { version = "1.0.228", features = ["derive"] }
29
- serde_json = "1.0.148"
29
+ serde_json = "1.0.149"
30
30
 
31
31
  # Error handling
32
32
  thiserror = "2.0.17"
@@ -47,10 +47,10 @@ hex = "0.4.3"
47
47
  toml = "0.9.10"
48
48
  num_cpus = "1.17.0"
49
49
  once_cell = "1.21.3"
50
- html-to-markdown-rs = { version = "2.19.5", default-features = false }
51
- reqwest = { version = "0.12.28", default-features = false, features = ["json", "rustls-tls"] }
50
+ html-to-markdown-rs = { version = "2.20.0", default-features = false }
51
+ reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
52
52
  image = { version = "0.25.9", default-features = false }
53
- lzma-rust2 = { version = "0.15.4" }
53
+ lzma-rust2 = { version = "0.15.6" }
54
54
 
55
55
  # Testing (dev)
56
56
  tempfile = "3.24.0"
@@ -1,8 +1,7 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.0-rc.27"
3
+ version = "4.0.0-rc.28"
4
4
  edition = "2024"
5
- lints.workspace = true
6
5
  rust-version = "1.91"
7
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
8
7
  description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 50+ formats with async/sync APIs."
@@ -145,10 +144,6 @@ tokio = { workspace = true, optional = true }
145
144
  uuid = { version = "1.19.0", features = ["v4", "js"] }
146
145
  indexmap = "2.12.1"
147
146
  tracing = { workspace = true }
148
- reqwest = { workspace = true, default-features = false, features = [
149
- "json",
150
- "rustls-tls",
151
- ], optional = true }
152
147
  pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", features = [
153
148
  "thread_safe",
154
149
  "image_latest",
@@ -188,16 +183,12 @@ image = { workspace = true, default-features = false, features = [
188
183
  "gif",
189
184
  "rayon",
190
185
  ], optional = true }
191
- tiff = { version = "0.10", optional = true }
186
+ tiff = { version = "0.11", optional = true }
192
187
  fast_image_resize = { version = "5.5.0", optional = true }
193
188
  ndarray = { version = "0.17.1", optional = true }
194
189
  kamadak-exif = { version = "0.6.1", optional = true }
195
190
  whatlang = { version = "0.18.0", optional = true }
196
191
  text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
197
- fastembed = { version = "5.6", default-features = false, features = [
198
- "hf-hub-rustls-tls",
199
- "ort-load-dynamic",
200
- ], optional = true }
201
192
  unicode-normalization = { version = "0.1.25", optional = true }
202
193
  chardetng = { version = "0.1.17", optional = true }
203
194
  encoding_rs = { version = "0.8.35", optional = true }
@@ -225,15 +216,44 @@ tempfile = { workspace = true }
225
216
  filetime = "0.2"
226
217
  tar = "0.4.44"
227
218
  zip = "7.0.0"
228
- serial_test = "3.2.0"
219
+ serial_test = "3.3.1"
229
220
  anyhow = { workspace = true }
230
221
  tokio-test = "0.4"
231
222
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
232
223
  criterion = { workspace = true }
233
224
  image = { workspace = true, default-features = false, features = ["png"] }
234
225
 
235
- [target.'cfg(not(target_os = "windows"))'.dependencies]
226
+ [target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
236
227
  pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
228
+ # Use rustls on non-Windows platforms (Linux, macOS)
229
+ reqwest = { workspace = true, default-features = false, features = [
230
+ "json",
231
+ "rustls",
232
+ ], optional = true }
233
+ # Use rustls-tls for fastembed on non-Windows platforms
234
+ fastembed = { version = "5.6", default-features = false, features = [
235
+ "hf-hub-rustls-tls",
236
+ "ort-load-dynamic",
237
+ ], optional = true }
238
+ # Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
239
+ ureq = { version = "2.12", default-features = false, features = ["tls", "json"] }
240
+
241
+ # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
242
+ [target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
243
+ reqwest = { workspace = true, default-features = false, features = [
244
+ "json",
245
+ "native-tls",
246
+ ], optional = true }
247
+ # Use native-tls for fastembed on Windows
248
+ fastembed = { version = "5.6", default-features = false, features = [
249
+ "hf-hub-native-tls",
250
+ "ort-load-dynamic",
251
+ ], optional = true }
252
+ # Force ureq (transitive dep via hf-hub) to use native-tls on Windows
253
+ ureq = { version = "2.12", default-features = false, features = ["native-tls", "json"] }
237
254
 
238
255
  [target.'cfg(target_arch = "wasm32")'.dependencies]
239
256
  wasm-bindgen-rayon = { version = "1.3", optional = true }
257
+ # Override getrandom to enable js feature for WASM targets
258
+ # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
259
+ getrandom = { workspace = true }
@@ -9,7 +9,10 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
9
9
 
10
10
  use super::{
11
11
  error::ApiError,
12
- types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
12
+ types::{
13
+ ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ExtractResponse, HealthResponse,
14
+ InfoResponse,
15
+ },
13
16
  };
14
17
 
15
18
  /// Extract endpoint handler.
@@ -34,6 +37,14 @@ use super::{
34
37
  ///
35
38
  /// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
36
39
  /// is used as the base, and any per-request config overrides those defaults.
40
+ #[cfg_attr(
41
+ feature = "otel",
42
+ tracing::instrument(
43
+ name = "api.extract",
44
+ skip(state, multipart),
45
+ fields(files_count = tracing::field::Empty)
46
+ )
47
+ )]
37
48
  pub async fn extract_handler(
38
49
  State(state): State<ApiState>,
39
50
  mut multipart: Multipart,
@@ -84,6 +95,9 @@ pub async fn extract_handler(
84
95
  )));
85
96
  }
86
97
 
98
+ #[cfg(feature = "otel")]
99
+ tracing::Span::current().record("files_count", files.len());
100
+
87
101
  if files.len() == 1 {
88
102
  let (data, mime_type, _file_name) = files
89
103
  .into_iter()
@@ -102,6 +116,7 @@ pub async fn extract_handler(
102
116
  /// Health check endpoint handler.
103
117
  ///
104
118
  /// GET /health
119
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
105
120
  pub async fn health_handler() -> Json<HealthResponse> {
106
121
  Json(HealthResponse {
107
122
  status: "healthy".to_string(),
@@ -112,6 +127,7 @@ pub async fn health_handler() -> Json<HealthResponse> {
112
127
  /// Server info endpoint handler.
113
128
  ///
114
129
  /// GET /info
130
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
115
131
  pub async fn info_handler() -> Json<InfoResponse> {
116
132
  Json(InfoResponse {
117
133
  version: env!("CARGO_PKG_VERSION").to_string(),
@@ -129,6 +145,7 @@ pub async fn info_handler() -> Json<InfoResponse> {
129
145
  /// - Current directory cannot be determined
130
146
  /// - Cache directory path contains non-UTF8 characters
131
147
  /// - Cache metadata retrieval fails
148
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
132
149
  pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
133
150
  let cache_dir = std::env::current_dir()
134
151
  .map_err(|e| {
@@ -168,6 +185,7 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
168
185
  /// - Current directory cannot be determined
169
186
  /// - Cache directory path contains non-UTF8 characters
170
187
  /// - Cache clearing operation fails
188
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
171
189
  pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
172
190
  let cache_dir = std::env::current_dir()
173
191
  .map_err(|e| {
@@ -193,3 +211,110 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
193
211
  freed_mb,
194
212
  }))
195
213
  }
214
+
215
+ /// Embedding endpoint handler.
216
+ ///
217
+ /// POST /embed
218
+ ///
219
+ /// Accepts JSON body with:
220
+ /// - `texts`: Array of strings to generate embeddings for
221
+ /// - `config` (optional): Embedding configuration (model, batch size, cache_dir)
222
+ ///
223
+ /// Returns embeddings for each input text.
224
+ ///
225
+ /// # Errors
226
+ ///
227
+ /// Returns `ApiError::Internal` if:
228
+ /// - Embeddings feature is not enabled
229
+ /// - ONNX Runtime is not available
230
+ /// - Model initialization fails
231
+ /// - Embedding generation fails
232
+ #[cfg(feature = "embeddings")]
233
+ #[cfg_attr(
234
+ feature = "otel",
235
+ tracing::instrument(
236
+ name = "api.embed",
237
+ skip(request),
238
+ fields(
239
+ texts_count = request.texts.len(),
240
+ model = tracing::field::Empty
241
+ )
242
+ )
243
+ )]
244
+ pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
245
+ use crate::types::{Chunk, ChunkMetadata};
246
+
247
+ if request.texts.is_empty() {
248
+ return Err(ApiError::validation(crate::error::KreuzbergError::validation(
249
+ "No texts provided for embedding generation",
250
+ )));
251
+ }
252
+
253
+ // Use default config if none provided
254
+ let config = request.config.unwrap_or_default();
255
+
256
+ // Create chunks from input texts
257
+ let mut chunks: Vec<Chunk> = request
258
+ .texts
259
+ .iter()
260
+ .enumerate()
261
+ .map(|(idx, text)| Chunk {
262
+ content: text.clone(),
263
+ embedding: None,
264
+ metadata: ChunkMetadata {
265
+ byte_start: 0,
266
+ byte_end: text.len(),
267
+ token_count: None,
268
+ chunk_index: idx,
269
+ total_chunks: request.texts.len(),
270
+ first_page: None,
271
+ last_page: None,
272
+ },
273
+ })
274
+ .collect();
275
+
276
+ // Generate embeddings
277
+ crate::embeddings::generate_embeddings_for_chunks(&mut chunks, &config).map_err(ApiError::internal)?;
278
+
279
+ // Extract embeddings from chunks
280
+ let embeddings: Vec<Vec<f32>> = chunks
281
+ .into_iter()
282
+ .map(|chunk| {
283
+ chunk.embedding.ok_or_else(|| {
284
+ ApiError::internal(crate::error::KreuzbergError::Other(
285
+ "Failed to generate embedding for text".to_string(),
286
+ ))
287
+ })
288
+ })
289
+ .collect::<Result<Vec<_>, _>>()?;
290
+
291
+ let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
292
+
293
+ // Get model name from config
294
+ let model_name = match &config.model {
295
+ crate::core::config::EmbeddingModelType::Preset { name } => name.clone(),
296
+ #[cfg(feature = "embeddings")]
297
+ crate::core::config::EmbeddingModelType::FastEmbed { model, .. } => model.clone(),
298
+ crate::core::config::EmbeddingModelType::Custom { .. } => "custom".to_string(),
299
+ };
300
+
301
+ #[cfg(feature = "otel")]
302
+ tracing::Span::current().record("model", &model_name);
303
+
304
+ Ok(Json(EmbedResponse {
305
+ embeddings,
306
+ model: model_name,
307
+ dimensions,
308
+ count: request.texts.len(),
309
+ }))
310
+ }
311
+
312
+ /// Embedding endpoint handler (when embeddings feature is disabled).
313
+ ///
314
+ /// Returns an error indicating embeddings feature is not enabled.
315
+ #[cfg(not(feature = "embeddings"))]
316
+ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
317
+ Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
318
+ "Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
319
+ )))
320
+ }
@@ -6,6 +6,7 @@
6
6
  //! # Endpoints
7
7
  //!
8
8
  //! - `POST /extract` - Extract text from uploaded files (multipart form data)
9
+ //! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
9
10
  //! - `GET /health` - Health check endpoint
10
11
  //! - `GET /info` - Server information
11
12
  //! - `GET /cache/stats` - Get cache statistics
@@ -70,6 +71,11 @@
70
71
  //!
71
72
  //! # Clear cache
72
73
  //! curl -X DELETE http://localhost:8000/cache/clear
74
+ //!
75
+ //! # Generate embeddings
76
+ //! curl -X POST http://localhost:8000/embed \
77
+ //! -H "Content-Type: application/json" \
78
+ //! -d '{"texts":["Hello world","Second text"]}'
73
79
  //! ```
74
80
 
75
81
  mod error;
@@ -79,9 +85,10 @@ mod types;
79
85
 
80
86
  pub use error::ApiError;
81
87
  pub use server::{
82
- create_router, create_router_with_limits, serve, serve_default, serve_with_config, serve_with_config_and_limits,
88
+ create_router, create_router_with_limits, create_router_with_limits_and_server_config, load_server_config, serve,
89
+ serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
83
90
  };
84
91
  pub use types::{
85
- ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ErrorResponse, ExtractResponse, HealthResponse,
86
- InfoResponse,
92
+ ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ErrorResponse,
93
+ ExtractResponse, HealthResponse, InfoResponse,
87
94
  };