kreuzberg 4.9.7 → 4.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +5 -5
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +12 -13
- data/vendor/kreuzberg/Cargo.toml +37 -31
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/pdf.rs +2 -5
- data/vendor/kreuzberg/src/core/config/tree_sitter.rs +0 -1
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +6 -1
- data/vendor/kreuzberg/src/core/extractor/file.rs +6 -1
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +115 -15
- data/vendor/kreuzberg/src/embeddings/mod.rs +17 -13
- data/vendor/kreuzberg/src/extraction/email.rs +58 -7
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +72 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +0 -168
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +1 -410
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +41 -15
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +45 -1
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +13 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +62 -11
- data/vendor/kreuzberg/src/llm/structured.rs +22 -17
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +11 -6
- data/vendor/kreuzberg/src/pdf/images.rs +22 -4
- data/vendor/kreuzberg/src/pdf/mod.rs +0 -16
- data/vendor/kreuzberg/src/pdf/rendering.rs +53 -6
- data/vendor/kreuzberg/src/pdf/structure/mod.rs +0 -2
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +12 -890
- data/vendor/kreuzberg/src/table_core.rs +8 -1
- data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +26 -0
- data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +1 -2
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +35 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +7 -7
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/config/loader.rs +39 -24
- data/vendor/kreuzberg-ffi/src/config/mod.rs +0 -4
- data/vendor/kreuzberg-ffi/src/lib.rs +1 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
- data/vendor/kreuzberg-paddle-ocr/src/ocr_utils.rs +3 -3
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
- metadata +2 -10
- data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +0 -258
- data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +0 -235
- data/vendor/kreuzberg/src/pdf/oxide/images.rs +0 -53
- data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +0 -381
- data/vendor/kreuzberg/src/pdf/oxide/mod.rs +0 -43
- data/vendor/kreuzberg/src/pdf/oxide/table.rs +0 -247
- data/vendor/kreuzberg/src/pdf/oxide/text.rs +0 -250
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +0 -121
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de92334e109bbca1bdd22469a651f146bf29eee730dfc841c3dcb4703ee3ba5b
|
|
4
|
+
data.tar.gz: 53140e24511ff0910814325859b3e7382ee9f511e2412181812607f0f3516f33
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 80b7a6fa716b1adf28d543074581d5f88984aae738669e52116c49d301b1116ed7311c4bcfa90d1853a9d98752b0d2ad13b5c0e14e9f2623217ff319c007eca2
|
|
7
|
+
data.tar.gz: fab95d4048b382cf3ff4e154d7979bc4963e35e57c802cd5f871950d460ef16c5745c66931fbbf15b02a479f0a991a9e1832a126d445c002c7bf79b92ae143b9
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.7" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.9" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.9"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
|
49
49
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
50
50
|
"rb-sys",
|
|
51
51
|
] }
|
|
52
|
-
rb-sys = { version = "0.9.
|
|
52
|
+
rb-sys = { version = "0.9.128", default-features = false, features = [
|
|
53
53
|
"stable-api-compiled-fallback",
|
|
54
54
|
] }
|
|
55
|
-
serde_json = "1.0.
|
|
55
|
+
serde_json = "1.0.150"
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
serde_yaml_ng = "0.10"
|
|
58
|
-
tokio = { version = "1.52.1", features = [
|
|
58
|
+
tokio = { version = "1.52.3", features = [
|
|
59
59
|
"rt",
|
|
60
60
|
"rt-multi-thread",
|
|
61
61
|
"macros",
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.
|
|
68
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.7"
|
|
5
|
+
version = "4.9.9"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -23,41 +23,40 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
|
23
23
|
comrak = { version = "0.52", default-features = false }
|
|
24
24
|
console_error_panic_hook = "0.1"
|
|
25
25
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
26
|
-
ctor = "0
|
|
26
|
+
ctor = "1.0"
|
|
27
27
|
dbase = "0.7"
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.
|
|
31
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.7", default-features = false }
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.7" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.9", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.9" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
|
-
libc = "0.2.
|
|
39
|
-
liter-llm = { version = "1.
|
|
38
|
+
libc = "0.2.186"
|
|
39
|
+
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
|
|
40
40
|
log = "0.4"
|
|
41
|
-
lzma-rust2 = { version = "0.16.
|
|
41
|
+
lzma-rust2 = { version = "0.16.4" }
|
|
42
42
|
memmap2 = "0.9"
|
|
43
43
|
minijinja = "2"
|
|
44
44
|
num_cpus = "1.17.0"
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.37", default-features = false }
|
|
49
48
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
49
|
rayon = "1.12.0"
|
|
51
|
-
reqwest = { version = "0.13.
|
|
50
|
+
reqwest = { version = "0.13.4", default-features = false }
|
|
52
51
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
53
|
-
serde_json = { version = "1.0.
|
|
52
|
+
serde_json = { version = "1.0.150" }
|
|
54
53
|
serde_toon_format = "0.1"
|
|
55
54
|
tempfile = "3.27.0"
|
|
56
55
|
thiserror = "2.0.18"
|
|
57
|
-
tokio = { version = "1.52.
|
|
56
|
+
tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
57
|
toml = "1.1.2"
|
|
59
58
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.
|
|
59
|
+
tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false }
|
|
61
60
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
61
|
wasm-bindgen-futures = "0.4"
|
|
63
62
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.9"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -39,10 +39,6 @@ simd-utf8 = ["dep:simdutf8"]
|
|
|
39
39
|
tokio-runtime = ["dep:tokio"]
|
|
40
40
|
|
|
41
41
|
pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image", "dep:flate2", "html"]
|
|
42
|
-
# Experimental: use pdf_oxide for text extraction (pure Rust, no C++ deps).
|
|
43
|
-
# Provides cleaner word spacing for PDFs with broken font CMaps.
|
|
44
|
-
# Requires 'pdf' feature. Not included in 'full' — opt-in only.
|
|
45
|
-
pdf-oxide = ["pdf", "dep:pdf_oxide"]
|
|
46
42
|
static-pdfium = ["pdf"]
|
|
47
43
|
bundled-pdfium = ["pdf"]
|
|
48
44
|
system-pdfium = ["pdf"]
|
|
@@ -61,7 +57,14 @@ office = [
|
|
|
61
57
|
]
|
|
62
58
|
hwp = ["dep:cfb", "dep:flate2"]
|
|
63
59
|
iwork = ["dep:zip", "dep:snap"]
|
|
64
|
-
email = [
|
|
60
|
+
email = [
|
|
61
|
+
"dep:mail-parser",
|
|
62
|
+
"dep:cfb",
|
|
63
|
+
"dep:outlook-pst",
|
|
64
|
+
"dep:tempfile",
|
|
65
|
+
"dep:chrono",
|
|
66
|
+
"dep:chardetng",
|
|
67
|
+
]
|
|
65
68
|
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
|
|
66
69
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
67
70
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
|
|
@@ -254,12 +257,12 @@ biblib = { version = "0.4", default-features = false, features = [
|
|
|
254
257
|
bitvec = "1.0"
|
|
255
258
|
blake3 = "1"
|
|
256
259
|
bytes = { version = "1", features = ["serde"] }
|
|
257
|
-
calamine = { version = "0.
|
|
260
|
+
calamine = { version = "0.35.0", features = ["dates"], optional = true }
|
|
258
261
|
cfb = { version = "0.14", optional = true }
|
|
259
262
|
chardetng = { version = "1.0.0", optional = true }
|
|
260
263
|
chrono = { version = "0.4", optional = true }
|
|
261
264
|
comrak = { version = "0.52", default-features = false }
|
|
262
|
-
dashmap = "6.
|
|
265
|
+
dashmap = "6.2"
|
|
263
266
|
dbase = { version = "0.7", optional = true }
|
|
264
267
|
dirs = "6"
|
|
265
268
|
encoding_rs = { version = "0.8.35" }
|
|
@@ -271,7 +274,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
274
|
"simd",
|
|
272
275
|
], optional = true }
|
|
273
276
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.
|
|
277
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false, features = [
|
|
275
278
|
"inline-images",
|
|
276
279
|
"metadata",
|
|
277
280
|
], optional = true }
|
|
@@ -291,20 +294,20 @@ jotdown = "0.10"
|
|
|
291
294
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
292
295
|
|
|
293
296
|
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
294
|
-
libc = "0.2.
|
|
295
|
-
liter-llm = { version = "1.
|
|
297
|
+
libc = "0.2.186"
|
|
298
|
+
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
|
|
296
299
|
log = "0.4"
|
|
297
|
-
lopdf = { version = "0.
|
|
298
|
-
mail-parser = { version = "0.11.
|
|
299
|
-
memchr = "2.8.
|
|
300
|
+
lopdf = { version = "0.41.0", optional = true }
|
|
301
|
+
mail-parser = { version = "0.11.3", optional = true }
|
|
302
|
+
memchr = "2.8.1"
|
|
300
303
|
memmap2 = "0.9"
|
|
301
304
|
mime_guess = "2.0"
|
|
302
305
|
minijinja = { version = "2", optional = true }
|
|
303
306
|
ndarray = { version = "0.17", optional = true }
|
|
304
307
|
num_cpus = "1.17.0"
|
|
305
308
|
once_cell = "1.21.4"
|
|
306
|
-
opentelemetry = { version = "0.
|
|
307
|
-
opentelemetry_sdk = { version = "0.
|
|
309
|
+
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
|
|
310
|
+
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
|
|
308
311
|
org = { version = "0.3", optional = true }
|
|
309
312
|
ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
310
313
|
"std",
|
|
@@ -314,14 +317,13 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
317
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
318
|
parking_lot = "0.12.5"
|
|
316
319
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
|
|
318
320
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
321
|
pulldown-cmark = { version = "0.13" }
|
|
320
|
-
quick-xml = { version = "0.
|
|
322
|
+
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
|
|
321
323
|
rake = { version = "0.3.6", optional = true }
|
|
322
324
|
rayon = "1.12.0"
|
|
323
325
|
regex = "1.12.3"
|
|
324
|
-
rmcp = { version = "1.
|
|
326
|
+
rmcp = { version = "1.7.0", features = [
|
|
325
327
|
"server",
|
|
326
328
|
"macros",
|
|
327
329
|
"base64",
|
|
@@ -333,32 +335,36 @@ rmp-serde = "1.3"
|
|
|
333
335
|
|
|
334
336
|
roxmltree = { version = "0.21.1", optional = true }
|
|
335
337
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
336
|
-
serde_json = { version = "1.0.
|
|
338
|
+
serde_json = { version = "1.0.150" }
|
|
337
339
|
serde_toon_format = "0.1"
|
|
338
340
|
serde_yaml_ng = "0.10.0"
|
|
339
341
|
sevenz-rust2 = { version = "0.20.2", optional = true }
|
|
340
342
|
sha2 = { version = "0.11", optional = true }
|
|
341
343
|
simdutf8 = { version = "0.1", optional = true }
|
|
342
344
|
snap = { version = "1.1", optional = true }
|
|
343
|
-
tar = { version = "0.4.
|
|
345
|
+
tar = { version = "0.4.46", optional = true }
|
|
344
346
|
tempfile = { version = "3.27.0", optional = true }
|
|
345
|
-
text-splitter = { version = "0.
|
|
347
|
+
text-splitter = { version = "0.31.0", features = ["markdown"], optional = true }
|
|
346
348
|
thiserror = "2.0.18"
|
|
347
349
|
tiff = { version = "0.11", optional = true }
|
|
348
|
-
|
|
350
|
+
# Keep aligned with text-splitter's optional tokenizers integration so ChunkSizer
|
|
351
|
+
# is implemented for the same Tokenizer type used by Kreuzberg.
|
|
352
|
+
tokenizers = { version = "0.23.1", optional = true, default-features = false, features = [
|
|
349
353
|
"http",
|
|
350
354
|
"fancy-regex",
|
|
351
355
|
] }
|
|
352
|
-
tokio = { version = "1.52.
|
|
356
|
+
tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
|
|
353
357
|
toml = "1.1.2"
|
|
354
358
|
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
355
359
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
356
360
|
tracing = "0.1"
|
|
357
|
-
tracing-opentelemetry = { version = "0.
|
|
361
|
+
tracing-opentelemetry = { version = "0.33", optional = true }
|
|
358
362
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
359
363
|
urlencoding = "2"
|
|
360
|
-
utoipa = { version = "5.
|
|
361
|
-
|
|
364
|
+
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
|
|
365
|
+
# Pinned to 0.15 — v_htmlescape 0.17 renamed `escape` fn to an `Escape` struct.
|
|
366
|
+
# Update call sites in src/rendering/html_styled.rs before bumping.
|
|
367
|
+
v_htmlescape = { version = "=0.15.8", optional = true }
|
|
362
368
|
whatlang = { version = "0.18.0", optional = true }
|
|
363
369
|
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
|
364
370
|
"deflate-flate2",
|
|
@@ -392,7 +398,7 @@ optional = true
|
|
|
392
398
|
# Override getrandom to enable js feature for WASM targets
|
|
393
399
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
400
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.
|
|
401
|
+
tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false, optional = true }
|
|
396
402
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
403
|
|
|
398
404
|
[build-dependencies]
|
|
@@ -406,10 +412,10 @@ dotenvy = "0.15"
|
|
|
406
412
|
filetime = "0.2"
|
|
407
413
|
image = { version = "0.25.10", default-features = false, features = ["png"] }
|
|
408
414
|
jsonschema = "0.46"
|
|
409
|
-
serial_test = "3.
|
|
410
|
-
tar = "0.4.
|
|
415
|
+
serial_test = "3.5.0"
|
|
416
|
+
tar = "0.4.46"
|
|
411
417
|
tempfile = "3.27.0"
|
|
412
|
-
tokio = { version = "1.52.
|
|
418
|
+
tokio = { version = "1.52.3", features = ["macros", "time"] }
|
|
413
419
|
tokio-test = "0.4"
|
|
414
420
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
415
421
|
zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.7 Release**
|
|
21
|
+
> **🚀 Version 4.9.9 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -8,17 +8,14 @@ use serde::{Deserialize, Serialize};
|
|
|
8
8
|
/// PDF extraction backend selection.
|
|
9
9
|
///
|
|
10
10
|
/// Controls which PDF library is used for text extraction:
|
|
11
|
-
/// - `Pdfium`: pdfium-render (default,
|
|
12
|
-
/// - `
|
|
13
|
-
/// - `Auto`: automatically select based on available features
|
|
11
|
+
/// - `Pdfium`: pdfium-render (default, mature)
|
|
12
|
+
/// - `Auto`: automatically select the default available backend
|
|
14
13
|
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
15
14
|
#[serde(rename_all = "lowercase")]
|
|
16
15
|
pub enum PdfBackend {
|
|
17
16
|
/// Use pdfium-render backend (default).
|
|
18
17
|
#[default]
|
|
19
18
|
Pdfium,
|
|
20
|
-
/// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
|
|
21
|
-
PdfOxide,
|
|
22
19
|
/// Automatically select the best available backend.
|
|
23
20
|
Auto,
|
|
24
21
|
}
|
|
@@ -128,7 +128,12 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
128
128
|
|
|
129
129
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
130
130
|
let result = {
|
|
131
|
-
|
|
131
|
+
if config.extraction_timeout_secs.is_some() {
|
|
132
|
+
return Err(crate::KreuzbergError::Validation {
|
|
133
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
134
|
+
source: None,
|
|
135
|
+
});
|
|
136
|
+
}
|
|
132
137
|
extraction_future.await
|
|
133
138
|
};
|
|
134
139
|
|
|
@@ -142,7 +142,12 @@ pub async fn extract_file(
|
|
|
142
142
|
|
|
143
143
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
144
144
|
let result = {
|
|
145
|
-
|
|
145
|
+
if config.extraction_timeout_secs.is_some() {
|
|
146
|
+
return Err(crate::KreuzbergError::Validation {
|
|
147
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
148
|
+
source: None,
|
|
149
|
+
});
|
|
150
|
+
}
|
|
146
151
|
extraction_future.await
|
|
147
152
|
};
|
|
148
153
|
|
|
@@ -35,6 +35,13 @@ pub(super) fn extract_bytes_sync_impl(
|
|
|
35
35
|
let cfg = config.cloned().unwrap_or_default();
|
|
36
36
|
let cfg = cfg.normalized().into_owned();
|
|
37
37
|
|
|
38
|
+
if cfg.extraction_timeout_secs.is_some() {
|
|
39
|
+
return Err(crate::KreuzbergError::Validation {
|
|
40
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
41
|
+
source: None,
|
|
42
|
+
});
|
|
43
|
+
}
|
|
44
|
+
|
|
38
45
|
let validated_mime = if let Some(mime) = mime_type {
|
|
39
46
|
if mime == "application/octet-stream" {
|
|
40
47
|
mime::detect_mime_type_from_bytes(content)?
|
|
@@ -37,10 +37,19 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
37
37
|
continue;
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
40
|
+
let normalized: String = page
|
|
41
|
+
.content
|
|
42
|
+
.split("\n\n")
|
|
43
|
+
.map(str::trim)
|
|
44
|
+
.filter(|s| !s.is_empty())
|
|
45
|
+
.collect::<Vec<_>>()
|
|
46
|
+
.join("\n\n");
|
|
47
|
+
|
|
48
|
+
// Try normalized exact match first. PDF page text can contain trailing
|
|
49
|
+
// spaces that render_plain strips before chunking.
|
|
50
|
+
if let Some(pos) = content[search_offset..].find(normalized.as_str()) {
|
|
42
51
|
let byte_start = search_offset + pos;
|
|
43
|
-
let byte_end = content.floor_char_boundary(byte_start +
|
|
52
|
+
let byte_end = content.floor_char_boundary(byte_start + normalized.len());
|
|
44
53
|
boundaries.push(PageBoundary {
|
|
45
54
|
page_number: page.page_number,
|
|
46
55
|
byte_start,
|
|
@@ -50,12 +59,12 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
50
59
|
continue;
|
|
51
60
|
}
|
|
52
61
|
|
|
53
|
-
// Fallback: search for first non-empty line of page content
|
|
62
|
+
// Fallback: search for first non-empty line of page content.
|
|
54
63
|
if let Some(line) = page.content.lines().find(|l| !l.trim().is_empty()).map(|l| l.trim())
|
|
55
64
|
&& let Some(pos) = content[search_offset..].find(line)
|
|
56
65
|
{
|
|
57
66
|
let byte_start = search_offset + pos;
|
|
58
|
-
let raw_end = (byte_start +
|
|
67
|
+
let raw_end = (byte_start + normalized.len()).min(content.len());
|
|
59
68
|
let byte_end = content.floor_char_boundary(raw_end);
|
|
60
69
|
boundaries.push(PageBoundary {
|
|
61
70
|
page_number: page.page_number,
|
|
@@ -176,25 +185,27 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
|
|
|
176
185
|
let resolved_config = chunking_config.resolve_preset();
|
|
177
186
|
let chunking_config = &resolved_config;
|
|
178
187
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
188
|
+
let (chunk_input, heading_source) = if config.output_format != crate::core::config::OutputFormat::Plain {
|
|
189
|
+
(
|
|
190
|
+
result.formatted_content.as_deref().unwrap_or(result.content.as_str()),
|
|
191
|
+
None,
|
|
192
|
+
)
|
|
193
|
+
} else {
|
|
194
|
+
(result.content.as_str(), result.formatted_content.as_deref())
|
|
195
|
+
};
|
|
196
|
+
|
|
183
197
|
let recomputed_boundaries: Option<Vec<PageBoundary>> = result
|
|
184
198
|
.pages
|
|
185
199
|
.as_deref()
|
|
186
|
-
.map(|pages| recompute_boundaries_from_pages(
|
|
200
|
+
.map(|pages| recompute_boundaries_from_pages(chunk_input, pages))
|
|
201
|
+
.filter(|boundaries| !boundaries.is_empty());
|
|
187
202
|
|
|
188
203
|
let page_boundaries: Option<&[PageBoundary]> = recomputed_boundaries
|
|
189
204
|
.as_deref()
|
|
190
205
|
.or_else(|| result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref()));
|
|
191
206
|
|
|
192
|
-
// Pass formatted_content (markdown) for heading context resolution when available.
|
|
193
|
-
// Plain-text rendering strips heading markers, but the markdown chunker needs them
|
|
194
|
-
// to build the heading hierarchy for chunk metadata.
|
|
195
|
-
let heading_source = result.formatted_content.as_deref();
|
|
196
207
|
match crate::chunking::chunk_text_with_heading_source(
|
|
197
|
-
|
|
208
|
+
chunk_input,
|
|
198
209
|
chunking_config,
|
|
199
210
|
page_boundaries,
|
|
200
211
|
heading_source,
|
|
@@ -314,3 +325,92 @@ pub(super) fn execute_token_reduction(result: &mut ExtractionResult, config: &Ex
|
|
|
314
325
|
|
|
315
326
|
Ok(())
|
|
316
327
|
}
|
|
328
|
+
|
|
329
|
+
#[cfg(test)]
|
|
330
|
+
#[cfg(feature = "chunking")]
|
|
331
|
+
mod tests {
|
|
332
|
+
use super::*;
|
|
333
|
+
use crate::core::config::{ChunkerType, ChunkingConfig, OutputFormat};
|
|
334
|
+
use crate::types::PageContent;
|
|
335
|
+
|
|
336
|
+
fn make_page(page_number: usize, content: &str) -> PageContent {
|
|
337
|
+
PageContent {
|
|
338
|
+
page_number,
|
|
339
|
+
content: content.to_string(),
|
|
340
|
+
tables: Vec::new(),
|
|
341
|
+
images: Vec::new(),
|
|
342
|
+
hierarchy: None,
|
|
343
|
+
is_blank: None,
|
|
344
|
+
layout_regions: None,
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
fn markdown_chunking_config() -> ExtractionConfig {
|
|
349
|
+
ExtractionConfig {
|
|
350
|
+
output_format: OutputFormat::Markdown,
|
|
351
|
+
chunking: Some(ChunkingConfig {
|
|
352
|
+
max_characters: 2000,
|
|
353
|
+
overlap: 0,
|
|
354
|
+
trim: true,
|
|
355
|
+
chunker_type: ChunkerType::Markdown,
|
|
356
|
+
..Default::default()
|
|
357
|
+
}),
|
|
358
|
+
..Default::default()
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
#[test]
|
|
363
|
+
fn chunks_content_is_markdown_when_output_format_is_markdown() {
|
|
364
|
+
let mut result = ExtractionResult {
|
|
365
|
+
content: "SH-001 Luca Bianchi Common Germany 3500000".to_string(),
|
|
366
|
+
formatted_content: Some("| SH-001 | Luca Bianchi | Common | Germany | 3,500,000 |".to_string()),
|
|
367
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
368
|
+
..Default::default()
|
|
369
|
+
};
|
|
370
|
+
|
|
371
|
+
execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
|
|
372
|
+
|
|
373
|
+
let chunks = result.chunks.expect("chunks must be populated");
|
|
374
|
+
assert!(!chunks.is_empty());
|
|
375
|
+
assert!(chunks.iter().any(|chunk| chunk.content.contains('|')));
|
|
376
|
+
assert!(chunks.iter().all(|chunk| !chunk.content.starts_with("SH-001 Luca")));
|
|
377
|
+
assert!(result.formatted_content.is_some());
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
#[test]
|
|
381
|
+
fn markdown_chunks_preserve_page_metadata_when_formatted_pages_match() {
|
|
382
|
+
let mut result = ExtractionResult {
|
|
383
|
+
content: "Page one text\n\nPage two text".to_string(),
|
|
384
|
+
formatted_content: Some("# Page one\n\nPage one text\n\n# Page two\n\nPage two text".to_string()),
|
|
385
|
+
pages: Some(vec![make_page(1, "Page one text"), make_page(2, "Page two text")]),
|
|
386
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
387
|
+
..Default::default()
|
|
388
|
+
};
|
|
389
|
+
|
|
390
|
+
execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
|
|
391
|
+
|
|
392
|
+
let chunks = result.chunks.expect("chunks must be populated");
|
|
393
|
+
assert!(!chunks.is_empty());
|
|
394
|
+
assert!(chunks.iter().any(|chunk| chunk.metadata.first_page.is_some()));
|
|
395
|
+
assert!(chunks.iter().any(|chunk| chunk.metadata.last_page.is_some()));
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
#[test]
|
|
399
|
+
fn recompute_boundaries_trailing_space_pages_all_resolve() {
|
|
400
|
+
let p1_raw = "Heading \n\nBody paragraph one. ";
|
|
401
|
+
let p2_raw = "Second heading \n\nBody paragraph two. ";
|
|
402
|
+
let p3_raw = "Conclusion. ";
|
|
403
|
+
let p1_norm = "Heading\n\nBody paragraph one.";
|
|
404
|
+
let p2_norm = "Second heading\n\nBody paragraph two.";
|
|
405
|
+
let p3_norm = "Conclusion.";
|
|
406
|
+
let content = format!("{p1_norm}\n\n{p2_norm}\n\n{p3_norm}");
|
|
407
|
+
|
|
408
|
+
let pages = vec![make_page(1, p1_raw), make_page(2, p2_raw), make_page(3, p3_raw)];
|
|
409
|
+
let boundaries = recompute_boundaries_from_pages(&content, &pages);
|
|
410
|
+
|
|
411
|
+
assert_eq!(boundaries.len(), 3);
|
|
412
|
+
assert_eq!(&content[boundaries[0].byte_start..boundaries[0].byte_end], p1_norm);
|
|
413
|
+
assert_eq!(&content[boundaries[1].byte_start..boundaries[1].byte_end], p2_norm);
|
|
414
|
+
assert_eq!(&content[boundaries[2].byte_start..boundaries[2].byte_end], p3_norm);
|
|
415
|
+
}
|
|
416
|
+
}
|
|
@@ -270,11 +270,13 @@ fn load_tokenizer(
|
|
|
270
270
|
{
|
|
271
271
|
for (_, value) in &map {
|
|
272
272
|
if let Some(content) = value.as_str() {
|
|
273
|
-
tokenizer
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
273
|
+
tokenizer
|
|
274
|
+
.add_special_tokens([AddedToken {
|
|
275
|
+
content: content.to_string(),
|
|
276
|
+
special: true,
|
|
277
|
+
..Default::default()
|
|
278
|
+
}])
|
|
279
|
+
.map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
|
|
278
280
|
} else if value.is_object()
|
|
279
281
|
&& let (Some(content), Some(single_word), Some(lstrip), Some(rstrip), Some(normalized)) = (
|
|
280
282
|
value["content"].as_str(),
|
|
@@ -284,14 +286,16 @@ fn load_tokenizer(
|
|
|
284
286
|
value["normalized"].as_bool(),
|
|
285
287
|
)
|
|
286
288
|
{
|
|
287
|
-
tokenizer
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
289
|
+
tokenizer
|
|
290
|
+
.add_special_tokens([AddedToken {
|
|
291
|
+
content: content.to_string(),
|
|
292
|
+
special: true,
|
|
293
|
+
single_word,
|
|
294
|
+
lstrip,
|
|
295
|
+
rstrip,
|
|
296
|
+
normalized,
|
|
297
|
+
}])
|
|
298
|
+
.map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
|
|
295
299
|
}
|
|
296
300
|
}
|
|
297
301
|
}
|