kreuzberg 4.2.10 → 4.2.11

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: abf625c4f7eedb0ba24619d640ac572192a112bf29876c25c662c8faf8a7219c
4
- data.tar.gz: 460cdf492f802db89332e989340070448c5b60bb44ce0860a1104889814bb9ac
3
+ metadata.gz: e8a57a9be868cea7df0628ac25c7f0aabea6ced5368b0ec69452abd8ace56cd9
4
+ data.tar.gz: dde627c6aff4ae060d53e5b44145bdaf6cfc8af870294f7521c200cf2b7e10ba
5
5
  SHA512:
6
- metadata.gz: 6e9b8b00347a73747e7ab8aad698f2d7a5798609dd1b086fe6df3a723c49bd05c5dff3c8ad0e7c83720cc3944b1a9d66fdec710405c9f1e22e43fe55387cdc92
7
- data.tar.gz: dab907905f37a8fbc13d4c3e7e893cf6162fe57c6d735bfe08dfec32ea721706ab683d1baa4cfa6b0db4db8c393e7909ee3cf98195881ea31c8dc5ce0cda0b6a
6
+ metadata.gz: 1c322dfecd4829e4e3aa13bbdd298f3f06f62877362867cc5795cc0690ef6b632ecbbac515f20c288ae3457ab5a022cb116b4922abc69a11e06151774f6f91f0
7
+ data.tar.gz: 50e8a2b5489f169afb9f6b60150954463ed31988b049607c86ac501051df8138e21eec3f04d1e7b8a68f00079078416f7b3a5c2e7d93bf6bfb2bb8313c8e8aa3
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.10)
4
+ kreuzberg (4.2.11)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -123,7 +123,7 @@ GEM
123
123
  rubocop (~> 1.81)
124
124
  ruby-progressbar (1.13.0)
125
125
  securerandom (0.4.1)
126
- sorbet-runtime (0.6.12914)
126
+ sorbet-runtime (0.6.12915)
127
127
  steep (1.10.0)
128
128
  activesupport (>= 5.1)
129
129
  concurrent-ruby (>= 1.1.10)
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.2.10)
212
+ kreuzberg (4.2.11)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -244,7 +244,7 @@ CHECKSUMS
244
244
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
245
245
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
246
246
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
247
- sorbet-runtime (0.6.12914) sha256=6d3c985d671dab9ab8ea244b51888b6e8e8e65e881e5bf816d1ac0950479dce6
247
+ sorbet-runtime (0.6.12915) sha256=21d2866b1edfe57c97d22f36db5bcf2db311f84290e56152e9faf4b4915aa315
248
248
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
249
249
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
250
250
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.11" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -2755,7 +2755,7 @@ dependencies = [
2755
2755
 
2756
2756
  [[package]]
2757
2757
  name = "kreuzberg-rb"
2758
- version = "4.0.0"
2758
+ version = "4.2.10"
2759
2759
  dependencies = [
2760
2760
  "async-trait",
2761
2761
  "html-to-markdown-rs",
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.2.10"
40
+ version = "4.2.11"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.10'
4
+ VERSION = '4.2.11'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.10"
6
+ version = "4.2.11"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.10"
3
+ version = "4.2.11"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -151,10 +151,10 @@ pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", featur
151
151
  "image_latest",
152
152
  ], optional = true }
153
153
  lopdf = { version = "0.39.0", optional = true }
154
- calamine = { version = "0.32.0", features = ["dates"], optional = true }
154
+ calamine = { version = "0.33.0", features = ["dates"], optional = true }
155
155
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
156
156
  roxmltree = { version = "0.21.1", optional = true }
157
- zip = { version = "7.2.0", optional = true }
157
+ zip = { version = "7.3.0", optional = true }
158
158
  mail-parser = { version = "0.11.1", optional = true }
159
159
  msg_parser = { version = "0.1.1", optional = true }
160
160
  html-to-markdown-rs = { workspace = true, features = [
@@ -218,7 +218,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
218
218
  tempfile = { workspace = true }
219
219
  filetime = "0.2"
220
220
  tar = "0.4.44"
221
- zip = "7.2.0"
221
+ zip = "7.3.0"
222
222
  serial_test = "3.3.1"
223
223
  anyhow = { workspace = true }
224
224
  tokio-test = "0.4"
@@ -239,7 +239,7 @@ fastembed = { version = "5.8", default-features = false, features = [
239
239
  "ort-load-dynamic",
240
240
  ], optional = true }
241
241
  # Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
242
- ureq = { version = "3.1", default-features = false, features = ["rustls", "json"] }
242
+ ureq = { version = "3.2", default-features = false, features = ["rustls", "json"] }
243
243
 
244
244
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
245
245
  [target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
@@ -253,7 +253,7 @@ fastembed = { version = "5.8", default-features = false, features = [
253
253
  "ort-load-dynamic",
254
254
  ], optional = true }
255
255
  # Force ureq (transitive dep via hf-hub) to use native-tls on Windows
256
- ureq = { version = "3.1", default-features = false, features = ["native-tls", "json"] }
256
+ ureq = { version = "3.2", default-features = false, features = ["native-tls", "json"] }
257
257
 
258
258
  [target.'cfg(target_arch = "wasm32")'.dependencies]
259
259
  wasm-bindgen-rayon = { version = "1.3", optional = true }
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.10 Release**
20
+ > **🚀 Version 4.2.11 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -9,6 +9,7 @@ use crate::{KreuzbergError, Result};
9
9
  use std::borrow::Cow;
10
10
  use std::path::Path;
11
11
  use std::sync::Arc;
12
+ use std::time::Instant;
12
13
 
13
14
  use super::bytes::extract_bytes;
14
15
  use super::file::extract_file;
@@ -82,10 +83,18 @@ pub async fn batch_extract_file(
82
83
 
83
84
  tasks.spawn(async move {
84
85
  let _permit = semaphore_clone.acquire().await.unwrap();
85
- let result =
86
+ let start = Instant::now();
87
+ let mut result =
86
88
  crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
87
89
  .await;
88
- (index, result)
90
+ let elapsed_ms = start.elapsed().as_millis() as u64;
91
+
92
+ // Add extraction timing to result metadata for benchmarking
93
+ if let Ok(ref mut r) = result {
94
+ r.metadata.extraction_duration_ms = Some(elapsed_ms);
95
+ }
96
+
97
+ (index, result, elapsed_ms)
89
98
  });
90
99
  }
91
100
 
@@ -93,10 +102,11 @@ pub async fn batch_extract_file(
93
102
 
94
103
  while let Some(task_result) = tasks.join_next().await {
95
104
  match task_result {
96
- Ok((index, Ok(result))) => {
105
+ Ok((index, Ok(result), _elapsed_ms)) => {
106
+ // Timing already added to result.metadata.extraction_duration_ms
97
107
  results[index] = Some(result);
98
108
  }
99
- Ok((index, Err(e))) => {
109
+ Ok((index, Err(e), elapsed_ms)) => {
100
110
  // All errors (including Io) should create error results
101
111
  // instead of causing early return that abandons running tasks
102
112
  let metadata = Metadata {
@@ -104,6 +114,7 @@ pub async fn batch_extract_file(
104
114
  error_type: format!("{:?}", e),
105
115
  message: e.to_string(),
106
116
  }),
117
+ extraction_duration_ms: Some(elapsed_ms),
107
118
  ..Default::default()
108
119
  };
109
120
 
@@ -196,11 +207,19 @@ pub async fn batch_extract_bytes(
196
207
 
197
208
  tasks.spawn(async move {
198
209
  let _permit = semaphore_clone.acquire().await.unwrap();
199
- let result = crate::core::batch_mode::with_batch_mode(async {
210
+ let start = Instant::now();
211
+ let mut result = crate::core::batch_mode::with_batch_mode(async {
200
212
  extract_bytes(&bytes, &mime_type, &config_clone).await
201
213
  })
202
214
  .await;
203
- (index, result)
215
+ let elapsed_ms = start.elapsed().as_millis() as u64;
216
+
217
+ // Add extraction timing to result metadata for benchmarking
218
+ if let Ok(ref mut r) = result {
219
+ r.metadata.extraction_duration_ms = Some(elapsed_ms);
220
+ }
221
+
222
+ (index, result, elapsed_ms)
204
223
  });
205
224
  }
206
225
 
@@ -208,10 +227,11 @@ pub async fn batch_extract_bytes(
208
227
 
209
228
  while let Some(task_result) = tasks.join_next().await {
210
229
  match task_result {
211
- Ok((index, Ok(result))) => {
230
+ Ok((index, Ok(result), _elapsed_ms)) => {
231
+ // Timing already added to result.metadata.extraction_duration_ms
212
232
  results[index] = Some(result);
213
233
  }
214
- Ok((index, Err(e))) => {
234
+ Ok((index, Err(e), elapsed_ms)) => {
215
235
  // All errors (including Io) should create error results
216
236
  // instead of causing early return that abandons running tasks
217
237
  let metadata = Metadata {
@@ -219,6 +239,7 @@ pub async fn batch_extract_bytes(
219
239
  error_type: format!("{:?}", e),
220
240
  message: e.to_string(),
221
241
  }),
242
+ extraction_duration_ms: Some(elapsed_ms),
222
243
  ..Default::default()
223
244
  };
224
245
 
@@ -253,6 +253,7 @@ mod tests {
253
253
  image_preprocessing: None,
254
254
  json_schema: None,
255
255
  error: None,
256
+ extraction_duration_ms: None,
256
257
  additional: Default::default(),
257
258
  }
258
259
  }
@@ -132,6 +132,13 @@ pub struct Metadata {
132
132
  #[serde(skip_serializing_if = "Option::is_none")]
133
133
  pub error: Option<ErrorMetadata>,
134
134
 
135
+ /// Extraction duration in milliseconds (for benchmarking).
136
+ ///
137
+ /// This field is populated by batch extraction to provide per-file timing
138
+ /// information. It's `None` for single-file extraction (which uses external timing).
139
+ #[serde(skip_serializing_if = "Option::is_none")]
140
+ pub extraction_duration_ms: Option<u64>,
141
+
135
142
  /// Additional custom fields from postprocessors.
136
143
  ///
137
144
  /// This flattened map allows Python/TypeScript postprocessors to add
@@ -83,6 +83,7 @@ async fn test_embed_empty_texts() {
83
83
 
84
84
  /// Test embed endpoint with custom embedding configuration.
85
85
  #[tokio::test]
86
+ #[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
86
87
  async fn test_embed_with_custom_config() {
87
88
  let app = create_router(ExtractionConfig::default());
88
89
 
@@ -125,6 +126,7 @@ async fn test_embed_with_custom_config() {
125
126
 
126
127
  /// Test embed endpoint with single text.
127
128
  #[tokio::test]
129
+ #[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
128
130
  async fn test_embed_single_text() {
129
131
  let app = create_router(ExtractionConfig::default());
130
132
 
@@ -201,6 +203,7 @@ async fn test_embed_batch() {
201
203
 
202
204
  /// Test embed endpoint with long text.
203
205
  #[tokio::test]
206
+ #[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
204
207
  async fn test_embed_long_text() {
205
208
  let app = create_router(ExtractionConfig::default());
206
209
 
@@ -317,6 +320,7 @@ async fn test_embed_rejects_simple_json_array() {
317
320
 
318
321
  /// Test embed endpoint preserves embedding vector values across calls.
319
322
  #[tokio::test]
323
+ #[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
320
324
  async fn test_embed_deterministic() {
321
325
  let app = create_router(ExtractionConfig::default());
322
326
 
@@ -376,6 +380,7 @@ async fn test_embed_deterministic() {
376
380
 
377
381
  /// Test embed endpoint with different embedding presets.
378
382
  #[tokio::test]
383
+ #[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
379
384
  async fn test_embed_different_presets() {
380
385
  let app = create_router(ExtractionConfig::default());
381
386
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.10"
3
+ version = "4.2.11"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ image = { workspace = true, features = ["png"] }
26
26
  [build-dependencies]
27
27
  cc = { version = "^1.2.55", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
- zip = { version = "7.2.0", optional = true }
29
+ zip = { version = "7.3.0", optional = true }
30
30
 
31
31
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
32
32
  [target.'cfg(target_os = "windows")'.build-dependencies]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.10
4
+ version: 4.2.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-05 00:00:00.000000000 Z
11
+ date: 2026-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys