kreuzberg 4.2.10 → 4.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +6 -6
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/extractor/batch.rs +29 -8
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +1 -0
- data/vendor/kreuzberg/src/types/metadata.rs +7 -0
- data/vendor/kreuzberg/tests/api_embed.rs +5 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e8a57a9be868cea7df0628ac25c7f0aabea6ced5368b0ec69452abd8ace56cd9
|
|
4
|
+
data.tar.gz: dde627c6aff4ae060d53e5b44145bdaf6cfc8af870294f7521c200cf2b7e10ba
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1c322dfecd4829e4e3aa13bbdd298f3f06f62877362867cc5795cc0690ef6b632ecbbac515f20c288ae3457ab5a022cb116b4922abc69a11e06151774f6f91f0
|
|
7
|
+
data.tar.gz: 50e8a2b5489f169afb9f6b60150954463ed31988b049607c86ac501051df8138e21eec3f04d1e7b8a68f00079078416f7b3a5c2e7d93bf6bfb2bb8313c8e8aa3
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.10)
|
|
4
|
+
kreuzberg (4.2.11)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -123,7 +123,7 @@ GEM
|
|
|
123
123
|
rubocop (~> 1.81)
|
|
124
124
|
ruby-progressbar (1.13.0)
|
|
125
125
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
126
|
+
sorbet-runtime (0.6.12915)
|
|
127
127
|
steep (1.10.0)
|
|
128
128
|
activesupport (>= 5.1)
|
|
129
129
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.10)
|
|
212
|
+
kreuzberg (4.2.11)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -244,7 +244,7 @@ CHECKSUMS
|
|
|
244
244
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
245
245
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
246
246
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
247
|
-
sorbet-runtime (0.6.
|
|
247
|
+
sorbet-runtime (0.6.12915) sha256=21d2866b1edfe57c97d22f36db5bcf2db311f84290e56152e9faf4b4915aa315
|
|
248
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
249
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
250
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.11" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.10"
|
|
3
|
+
version = "4.2.11"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -151,10 +151,10 @@ pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", featur
|
|
|
151
151
|
"image_latest",
|
|
152
152
|
], optional = true }
|
|
153
153
|
lopdf = { version = "0.39.0", optional = true }
|
|
154
|
-
calamine = { version = "0.
|
|
154
|
+
calamine = { version = "0.33.0", features = ["dates"], optional = true }
|
|
155
155
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
156
156
|
roxmltree = { version = "0.21.1", optional = true }
|
|
157
|
-
zip = { version = "7.
|
|
157
|
+
zip = { version = "7.3.0", optional = true }
|
|
158
158
|
mail-parser = { version = "0.11.1", optional = true }
|
|
159
159
|
msg_parser = { version = "0.1.1", optional = true }
|
|
160
160
|
html-to-markdown-rs = { workspace = true, features = [
|
|
@@ -218,7 +218,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
|
|
|
218
218
|
tempfile = { workspace = true }
|
|
219
219
|
filetime = "0.2"
|
|
220
220
|
tar = "0.4.44"
|
|
221
|
-
zip = "7.
|
|
221
|
+
zip = "7.3.0"
|
|
222
222
|
serial_test = "3.3.1"
|
|
223
223
|
anyhow = { workspace = true }
|
|
224
224
|
tokio-test = "0.4"
|
|
@@ -239,7 +239,7 @@ fastembed = { version = "5.8", default-features = false, features = [
|
|
|
239
239
|
"ort-load-dynamic",
|
|
240
240
|
], optional = true }
|
|
241
241
|
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
|
|
242
|
-
ureq = { version = "3.
|
|
242
|
+
ureq = { version = "3.2", default-features = false, features = ["rustls", "json"] }
|
|
243
243
|
|
|
244
244
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
245
245
|
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
|
|
@@ -253,7 +253,7 @@ fastembed = { version = "5.8", default-features = false, features = [
|
|
|
253
253
|
"ort-load-dynamic",
|
|
254
254
|
], optional = true }
|
|
255
255
|
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
|
|
256
|
-
ureq = { version = "3.
|
|
256
|
+
ureq = { version = "3.2", default-features = false, features = ["native-tls", "json"] }
|
|
257
257
|
|
|
258
258
|
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
|
259
259
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.10 Release**
|
|
20
|
+
> **🚀 Version 4.2.11 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -9,6 +9,7 @@ use crate::{KreuzbergError, Result};
|
|
|
9
9
|
use std::borrow::Cow;
|
|
10
10
|
use std::path::Path;
|
|
11
11
|
use std::sync::Arc;
|
|
12
|
+
use std::time::Instant;
|
|
12
13
|
|
|
13
14
|
use super::bytes::extract_bytes;
|
|
14
15
|
use super::file::extract_file;
|
|
@@ -82,10 +83,18 @@ pub async fn batch_extract_file(
|
|
|
82
83
|
|
|
83
84
|
tasks.spawn(async move {
|
|
84
85
|
let _permit = semaphore_clone.acquire().await.unwrap();
|
|
85
|
-
let
|
|
86
|
+
let start = Instant::now();
|
|
87
|
+
let mut result =
|
|
86
88
|
crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
|
|
87
89
|
.await;
|
|
88
|
-
(
|
|
90
|
+
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
91
|
+
|
|
92
|
+
// Add extraction timing to result metadata for benchmarking
|
|
93
|
+
if let Ok(ref mut r) = result {
|
|
94
|
+
r.metadata.extraction_duration_ms = Some(elapsed_ms);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
(index, result, elapsed_ms)
|
|
89
98
|
});
|
|
90
99
|
}
|
|
91
100
|
|
|
@@ -93,10 +102,11 @@ pub async fn batch_extract_file(
|
|
|
93
102
|
|
|
94
103
|
while let Some(task_result) = tasks.join_next().await {
|
|
95
104
|
match task_result {
|
|
96
|
-
Ok((index, Ok(result))) => {
|
|
105
|
+
Ok((index, Ok(result), _elapsed_ms)) => {
|
|
106
|
+
// Timing already added to result.metadata.extraction_duration_ms
|
|
97
107
|
results[index] = Some(result);
|
|
98
108
|
}
|
|
99
|
-
Ok((index, Err(e))) => {
|
|
109
|
+
Ok((index, Err(e), elapsed_ms)) => {
|
|
100
110
|
// All errors (including Io) should create error results
|
|
101
111
|
// instead of causing early return that abandons running tasks
|
|
102
112
|
let metadata = Metadata {
|
|
@@ -104,6 +114,7 @@ pub async fn batch_extract_file(
|
|
|
104
114
|
error_type: format!("{:?}", e),
|
|
105
115
|
message: e.to_string(),
|
|
106
116
|
}),
|
|
117
|
+
extraction_duration_ms: Some(elapsed_ms),
|
|
107
118
|
..Default::default()
|
|
108
119
|
};
|
|
109
120
|
|
|
@@ -196,11 +207,19 @@ pub async fn batch_extract_bytes(
|
|
|
196
207
|
|
|
197
208
|
tasks.spawn(async move {
|
|
198
209
|
let _permit = semaphore_clone.acquire().await.unwrap();
|
|
199
|
-
let
|
|
210
|
+
let start = Instant::now();
|
|
211
|
+
let mut result = crate::core::batch_mode::with_batch_mode(async {
|
|
200
212
|
extract_bytes(&bytes, &mime_type, &config_clone).await
|
|
201
213
|
})
|
|
202
214
|
.await;
|
|
203
|
-
(
|
|
215
|
+
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
216
|
+
|
|
217
|
+
// Add extraction timing to result metadata for benchmarking
|
|
218
|
+
if let Ok(ref mut r) = result {
|
|
219
|
+
r.metadata.extraction_duration_ms = Some(elapsed_ms);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
(index, result, elapsed_ms)
|
|
204
223
|
});
|
|
205
224
|
}
|
|
206
225
|
|
|
@@ -208,10 +227,11 @@ pub async fn batch_extract_bytes(
|
|
|
208
227
|
|
|
209
228
|
while let Some(task_result) = tasks.join_next().await {
|
|
210
229
|
match task_result {
|
|
211
|
-
Ok((index, Ok(result))) => {
|
|
230
|
+
Ok((index, Ok(result), _elapsed_ms)) => {
|
|
231
|
+
// Timing already added to result.metadata.extraction_duration_ms
|
|
212
232
|
results[index] = Some(result);
|
|
213
233
|
}
|
|
214
|
-
Ok((index, Err(e))) => {
|
|
234
|
+
Ok((index, Err(e), elapsed_ms)) => {
|
|
215
235
|
// All errors (including Io) should create error results
|
|
216
236
|
// instead of causing early return that abandons running tasks
|
|
217
237
|
let metadata = Metadata {
|
|
@@ -219,6 +239,7 @@ pub async fn batch_extract_bytes(
|
|
|
219
239
|
error_type: format!("{:?}", e),
|
|
220
240
|
message: e.to_string(),
|
|
221
241
|
}),
|
|
242
|
+
extraction_duration_ms: Some(elapsed_ms),
|
|
222
243
|
..Default::default()
|
|
223
244
|
};
|
|
224
245
|
|
|
@@ -132,6 +132,13 @@ pub struct Metadata {
|
|
|
132
132
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
133
133
|
pub error: Option<ErrorMetadata>,
|
|
134
134
|
|
|
135
|
+
/// Extraction duration in milliseconds (for benchmarking).
|
|
136
|
+
///
|
|
137
|
+
/// This field is populated by batch extraction to provide per-file timing
|
|
138
|
+
/// information. It's `None` for single-file extraction (which uses external timing).
|
|
139
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
140
|
+
pub extraction_duration_ms: Option<u64>,
|
|
141
|
+
|
|
135
142
|
/// Additional custom fields from postprocessors.
|
|
136
143
|
///
|
|
137
144
|
/// This flattened map allows Python/TypeScript postprocessors to add
|
|
@@ -83,6 +83,7 @@ async fn test_embed_empty_texts() {
|
|
|
83
83
|
|
|
84
84
|
/// Test embed endpoint with custom embedding configuration.
|
|
85
85
|
#[tokio::test]
|
|
86
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
86
87
|
async fn test_embed_with_custom_config() {
|
|
87
88
|
let app = create_router(ExtractionConfig::default());
|
|
88
89
|
|
|
@@ -125,6 +126,7 @@ async fn test_embed_with_custom_config() {
|
|
|
125
126
|
|
|
126
127
|
/// Test embed endpoint with single text.
|
|
127
128
|
#[tokio::test]
|
|
129
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
128
130
|
async fn test_embed_single_text() {
|
|
129
131
|
let app = create_router(ExtractionConfig::default());
|
|
130
132
|
|
|
@@ -201,6 +203,7 @@ async fn test_embed_batch() {
|
|
|
201
203
|
|
|
202
204
|
/// Test embed endpoint with long text.
|
|
203
205
|
#[tokio::test]
|
|
206
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
204
207
|
async fn test_embed_long_text() {
|
|
205
208
|
let app = create_router(ExtractionConfig::default());
|
|
206
209
|
|
|
@@ -317,6 +320,7 @@ async fn test_embed_rejects_simple_json_array() {
|
|
|
317
320
|
|
|
318
321
|
/// Test embed endpoint preserves embedding vector values across calls.
|
|
319
322
|
#[tokio::test]
|
|
323
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
320
324
|
async fn test_embed_deterministic() {
|
|
321
325
|
let app = create_router(ExtractionConfig::default());
|
|
322
326
|
|
|
@@ -376,6 +380,7 @@ async fn test_embed_deterministic() {
|
|
|
376
380
|
|
|
377
381
|
/// Test embed endpoint with different embedding presets.
|
|
378
382
|
#[tokio::test]
|
|
383
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
379
384
|
async fn test_embed_different_presets() {
|
|
380
385
|
let app = create_router(ExtractionConfig::default());
|
|
381
386
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.2.10"
|
|
3
|
+
version = "4.2.11"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -26,7 +26,7 @@ image = { workspace = true, features = ["png"] }
|
|
|
26
26
|
[build-dependencies]
|
|
27
27
|
cc = { version = "^1.2.55", optional = true }
|
|
28
28
|
cmake = { version = "0.1.57", optional = true }
|
|
29
|
-
zip = { version = "7.
|
|
29
|
+
zip = { version = "7.3.0", optional = true }
|
|
30
30
|
|
|
31
31
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
32
32
|
[target.'cfg(target_os = "windows")'.build-dependencies]
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.10
|
|
4
|
+
version: 4.2.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-06 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|