kreuzberg 4.9.7 → 4.9.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -4
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +11 -11
- data/vendor/kreuzberg/Cargo.toml +22 -17
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/tree_sitter.rs +0 -1
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +45 -1
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +13 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +62 -11
- data/vendor/kreuzberg/src/llm/structured.rs +22 -17
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +11 -6
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +1 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +35 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +5 -5
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/lib.rs +1 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b634075425816167cdb132080aa31a6c5c561badbd154de17896866bdf88ba6e
|
|
4
|
+
data.tar.gz: 4a8d041bad2da842a676b2e2358ba44d3269323e820728bdb568ebc9171adee1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 60c018a882054c23b629ee1d0692493c25c78a52e106fc7f25e3c22d7cdcfd36341f5aaeebea3d6be48d1425550914124daaeaeeb02bbd84bbb64cd6afedad59
|
|
7
|
+
data.tar.gz: 62d942f23dae32120184cd5b011275a2556b3e112a866128bd290a67ddbbd94c523ec2eaecd24589911264609ae612fd22eb51f3efc2e9eec4b1d08f8bd6823c
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.7" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.8" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
|
49
49
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
50
50
|
"rb-sys",
|
|
51
51
|
] }
|
|
52
|
-
rb-sys = { version = "0.9.
|
|
52
|
+
rb-sys = { version = "0.9.128", default-features = false, features = [
|
|
53
53
|
"stable-api-compiled-fallback",
|
|
54
54
|
] }
|
|
55
55
|
serde_json = "1.0.149"
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
serde_yaml_ng = "0.10"
|
|
58
|
-
tokio = { version = "1.52.1", features = [
|
|
58
|
+
tokio = { version = "1.52.3", features = [
|
|
59
59
|
"rt",
|
|
60
60
|
"rt-multi-thread",
|
|
61
61
|
"macros",
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.
|
|
68
|
+
html-to-markdown-rs = { version = "3.4.1", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.7"
|
|
5
|
+
version = "4.9.8"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -23,20 +23,20 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
|
23
23
|
comrak = { version = "0.52", default-features = false }
|
|
24
24
|
console_error_panic_hook = "0.1"
|
|
25
25
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
26
|
-
ctor = "0
|
|
26
|
+
ctor = "1.0"
|
|
27
27
|
dbase = "0.7"
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.
|
|
31
|
+
html-to-markdown-rs = { version = "3.4.1", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.8", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.8" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
|
-
libc = "0.2.
|
|
39
|
-
liter-llm = { version = "1.
|
|
38
|
+
libc = "0.2.186"
|
|
39
|
+
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
|
|
40
40
|
log = "0.4"
|
|
41
41
|
lzma-rust2 = { version = "0.16.2" }
|
|
42
42
|
memmap2 = "0.9"
|
|
@@ -45,19 +45,19 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.49", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.12.0"
|
|
51
|
-
reqwest = { version = "0.13.
|
|
51
|
+
reqwest = { version = "0.13.3", default-features = false }
|
|
52
52
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
53
53
|
serde_json = { version = "1.0.149" }
|
|
54
54
|
serde_toon_format = "0.1"
|
|
55
55
|
tempfile = "3.27.0"
|
|
56
56
|
thiserror = "2.0.18"
|
|
57
|
-
tokio = { version = "1.52.
|
|
57
|
+
tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
58
|
toml = "1.1.2"
|
|
59
59
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.
|
|
60
|
+
tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false }
|
|
61
61
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
62
|
wasm-bindgen-futures = "0.4"
|
|
63
63
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -254,7 +254,7 @@ biblib = { version = "0.4", default-features = false, features = [
|
|
|
254
254
|
bitvec = "1.0"
|
|
255
255
|
blake3 = "1"
|
|
256
256
|
bytes = { version = "1", features = ["serde"] }
|
|
257
|
-
calamine = { version = "0.
|
|
257
|
+
calamine = { version = "0.35.0", features = ["dates"], optional = true }
|
|
258
258
|
cfb = { version = "0.14", optional = true }
|
|
259
259
|
chardetng = { version = "1.0.0", optional = true }
|
|
260
260
|
chrono = { version = "0.4", optional = true }
|
|
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
271
|
"simd",
|
|
272
272
|
], optional = true }
|
|
273
273
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.
|
|
274
|
+
html-to-markdown-rs = { version = "3.4.1", default-features = false, features = [
|
|
275
275
|
"inline-images",
|
|
276
276
|
"metadata",
|
|
277
277
|
], optional = true }
|
|
@@ -291,11 +291,11 @@ jotdown = "0.10"
|
|
|
291
291
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
292
292
|
|
|
293
293
|
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
294
|
-
libc = "0.2.
|
|
295
|
-
liter-llm = { version = "1.
|
|
294
|
+
libc = "0.2.186"
|
|
295
|
+
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
|
|
296
296
|
log = "0.4"
|
|
297
297
|
lopdf = { version = "0.40.0", optional = true }
|
|
298
|
-
mail-parser = { version = "0.11.
|
|
298
|
+
mail-parser = { version = "0.11.3", optional = true }
|
|
299
299
|
memchr = "2.8.0"
|
|
300
300
|
memmap2 = "0.9"
|
|
301
301
|
mime_guess = "2.0"
|
|
@@ -303,8 +303,8 @@ minijinja = { version = "2", optional = true }
|
|
|
303
303
|
ndarray = { version = "0.17", optional = true }
|
|
304
304
|
num_cpus = "1.17.0"
|
|
305
305
|
once_cell = "1.21.4"
|
|
306
|
-
opentelemetry = { version = "0.
|
|
307
|
-
opentelemetry_sdk = { version = "0.
|
|
306
|
+
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
|
|
307
|
+
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
|
|
308
308
|
org = { version = "0.3", optional = true }
|
|
309
309
|
ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
310
310
|
"std",
|
|
@@ -314,14 +314,14 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.49", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
|
-
quick-xml = { version = "0.
|
|
320
|
+
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
|
|
321
321
|
rake = { version = "0.3.6", optional = true }
|
|
322
322
|
rayon = "1.12.0"
|
|
323
323
|
regex = "1.12.3"
|
|
324
|
-
rmcp = { version = "1.
|
|
324
|
+
rmcp = { version = "1.7.0", features = [
|
|
325
325
|
"server",
|
|
326
326
|
"macros",
|
|
327
327
|
"base64",
|
|
@@ -345,11 +345,14 @@ tempfile = { version = "3.27.0", optional = true }
|
|
|
345
345
|
text-splitter = { version = "0.30.1", features = ["markdown"], optional = true }
|
|
346
346
|
thiserror = "2.0.18"
|
|
347
347
|
tiff = { version = "0.11", optional = true }
|
|
348
|
-
|
|
348
|
+
# Pinned to 0.22 — text-splitter 0.30.1 ChunkSizer impl + embeddings/add_special_tokens
|
|
349
|
+
# break against tokenizers 0.23. Bump deliberately on the next minor with a coordinated
|
|
350
|
+
# text-splitter upgrade. Tracked under issue #991 / 4.9.8 release.
|
|
351
|
+
tokenizers = { version = "=0.22.2", optional = true, default-features = false, features = [
|
|
349
352
|
"http",
|
|
350
353
|
"fancy-regex",
|
|
351
354
|
] }
|
|
352
|
-
tokio = { version = "1.52.
|
|
355
|
+
tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
|
|
353
356
|
toml = "1.1.2"
|
|
354
357
|
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
355
358
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
@@ -357,8 +360,10 @@ tracing = "0.1"
|
|
|
357
360
|
tracing-opentelemetry = { version = "0.32", optional = true }
|
|
358
361
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
359
362
|
urlencoding = "2"
|
|
360
|
-
utoipa = { version = "5.
|
|
361
|
-
|
|
363
|
+
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
|
|
364
|
+
# Pinned to 0.15 — v_htmlescape 0.17 renamed `escape` fn to an `Escape` struct.
|
|
365
|
+
# Update call sites in src/rendering/html_styled.rs before bumping.
|
|
366
|
+
v_htmlescape = { version = "=0.15.8", optional = true }
|
|
362
367
|
whatlang = { version = "0.18.0", optional = true }
|
|
363
368
|
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
|
364
369
|
"deflate-flate2",
|
|
@@ -392,7 +397,7 @@ optional = true
|
|
|
392
397
|
# Override getrandom to enable js feature for WASM targets
|
|
393
398
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
399
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.
|
|
400
|
+
tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false, optional = true }
|
|
396
401
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
402
|
|
|
398
403
|
[build-dependencies]
|
|
@@ -409,7 +414,7 @@ jsonschema = "0.46"
|
|
|
409
414
|
serial_test = "3.4.0"
|
|
410
415
|
tar = "0.4.45"
|
|
411
416
|
tempfile = "3.27.0"
|
|
412
|
-
tokio = { version = "1.52.
|
|
417
|
+
tokio = { version = "1.52.3", features = ["macros", "time"] }
|
|
413
418
|
tokio-test = "0.4"
|
|
414
419
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
415
420
|
zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.7 Release**
|
|
21
|
+
> **🚀 Version 4.9.8 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
//! Character encoding utilities for RTF parsing.
|
|
2
2
|
//!
|
|
3
|
-
//! Provides hex byte parsing and Windows
|
|
3
|
+
//! Provides hex byte parsing and legacy Windows codepage decoding for RTF byte escapes.
|
|
4
|
+
|
|
5
|
+
use encoding_rs::Encoding;
|
|
4
6
|
|
|
5
7
|
/// Convert a hex digit character to its numeric value.
|
|
6
8
|
///
|
|
@@ -69,6 +71,48 @@ pub fn decode_windows_1252(byte: u8) -> char {
|
|
|
69
71
|
}
|
|
70
72
|
}
|
|
71
73
|
|
|
74
|
+
/// Map a Windows codepage number to an `encoding_rs` encoding.
|
|
75
|
+
///
|
|
76
|
+
/// Unknown values fall back to Windows-1252, the RTF default ANSI codepage.
|
|
77
|
+
#[inline]
|
|
78
|
+
pub(crate) fn encoding_for_windows_codepage(codepage: u32) -> &'static Encoding {
|
|
79
|
+
let label: &[u8] = match codepage {
|
|
80
|
+
65001 => b"utf-8",
|
|
81
|
+
20127 => b"us-ascii",
|
|
82
|
+
1250 => b"windows-1250",
|
|
83
|
+
1251 => b"windows-1251",
|
|
84
|
+
1252 => b"windows-1252",
|
|
85
|
+
1253 => b"windows-1253",
|
|
86
|
+
1254 => b"windows-1254",
|
|
87
|
+
1255 => b"windows-1255",
|
|
88
|
+
1256 => b"windows-1256",
|
|
89
|
+
1257 => b"windows-1257",
|
|
90
|
+
1258 => b"windows-1258",
|
|
91
|
+
932 | 10001 => b"shift_jis",
|
|
92
|
+
936 | 10008 => b"gbk",
|
|
93
|
+
949 | 10003 => b"euc-kr",
|
|
94
|
+
950 | 10002 => b"big5",
|
|
95
|
+
28591 => b"iso-8859-1",
|
|
96
|
+
28592 => b"iso-8859-2",
|
|
97
|
+
28595 => b"iso-8859-5",
|
|
98
|
+
28597 => b"iso-8859-7",
|
|
99
|
+
28599 => b"iso-8859-9",
|
|
100
|
+
_ => b"windows-1252",
|
|
101
|
+
};
|
|
102
|
+
Encoding::for_label(label).unwrap_or(encoding_rs::WINDOWS_1252)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Decode RTF hex escape bytes using the active ANSI codepage.
|
|
106
|
+
#[inline]
|
|
107
|
+
pub(crate) fn decode_ansi_bytes(bytes: &[u8], codepage: u32) -> String {
|
|
108
|
+
if codepage == 1252 {
|
|
109
|
+
return bytes.iter().map(|&byte| decode_windows_1252(byte)).collect();
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
let (decoded, _, _) = encoding_for_windows_codepage(codepage).decode(bytes);
|
|
113
|
+
decoded.into_owned()
|
|
114
|
+
}
|
|
115
|
+
|
|
72
116
|
/// Parse an RTF control word and extract its value.
|
|
73
117
|
///
|
|
74
118
|
/// Returns a tuple of (control_word, optional_numeric_value).
|
|
@@ -486,6 +486,19 @@ mod tests {
|
|
|
486
486
|
);
|
|
487
487
|
}
|
|
488
488
|
|
|
489
|
+
#[test]
|
|
490
|
+
fn test_rtf_ansicpg1251_hex_escape_extraction() {
|
|
491
|
+
let rtf_content =
|
|
492
|
+
r#"{\rtf1\ansi\ansicpg1251\deff0{\fonttbl{\f0\fnil\fcharset204 Arial;}}\f0 \'cf\'f0\'e8\'e2\'e5\'f2}"#;
|
|
493
|
+
let (text, _, _, _, _) = extract_text_from_rtf(rtf_content, true);
|
|
494
|
+
|
|
495
|
+
assert!(text.contains("Привет"), "expected readable Cyrillic, got: {text:?}");
|
|
496
|
+
assert!(
|
|
497
|
+
!text.contains("Ïðèâåò"),
|
|
498
|
+
"should not decode CP1251 bytes as Windows-1252"
|
|
499
|
+
);
|
|
500
|
+
}
|
|
501
|
+
|
|
489
502
|
#[tokio::test]
|
|
490
503
|
async fn test_rtf_document_structure_with_annotations() {
|
|
491
504
|
let rtf_content = r#"{\rtf1 Normal text\par {\b Bold paragraph}\par More normal text}"#;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
//! Core RTF parsing logic.
|
|
2
2
|
|
|
3
|
-
use crate::extractors::rtf::encoding::{
|
|
3
|
+
use crate::extractors::rtf::encoding::{decode_ansi_bytes, parse_hex_byte, parse_rtf_control_word};
|
|
4
4
|
use crate::extractors::rtf::formatting::{map_offset, normalize_whitespace_with_mapping};
|
|
5
5
|
use crate::extractors::rtf::images::{RtfImage, extract_pict_image};
|
|
6
6
|
use crate::extractors::rtf::tables::TableState;
|
|
@@ -944,6 +944,9 @@ pub fn extract_text_from_rtf(
|
|
|
944
944
|
// Unicode skip count (\ucN): how many replacement bytes follow \uN.
|
|
945
945
|
// Scoped per group — push on '{', pop on '}'.
|
|
946
946
|
let mut uc_stack: Vec<u8> = vec![1]; // default \uc1
|
|
947
|
+
// ANSI codepage for \'hh escapes. RTF defaults to Windows-1252 unless
|
|
948
|
+
// overridden by \ansicpgNNNN. Scoped like other document properties.
|
|
949
|
+
let mut ansi_codepage_stack: Vec<u32> = vec![1252];
|
|
947
950
|
|
|
948
951
|
// Hyperlink field tracking for \field{\*\fldinst HYPERLINK "url"}{\fldrslt text}
|
|
949
952
|
let mut in_fldinst = false;
|
|
@@ -1009,6 +1012,8 @@ pub fn extract_text_from_rtf(
|
|
|
1009
1012
|
// Inherit current uc value into new group scope
|
|
1010
1013
|
let current_uc = uc_stack.last().copied().unwrap_or(1);
|
|
1011
1014
|
uc_stack.push(current_uc);
|
|
1015
|
+
let current_codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
|
|
1016
|
+
ansi_codepage_stack.push(current_codepage);
|
|
1012
1017
|
// Inherit hidden state into new group scope
|
|
1013
1018
|
let current_hidden = hidden_stack.last().copied().unwrap_or(false);
|
|
1014
1019
|
hidden_stack.push(current_hidden);
|
|
@@ -1028,6 +1033,9 @@ pub fn extract_text_from_rtf(
|
|
|
1028
1033
|
if uc_stack.len() > 1 {
|
|
1029
1034
|
uc_stack.pop();
|
|
1030
1035
|
}
|
|
1036
|
+
if ansi_codepage_stack.len() > 1 {
|
|
1037
|
+
ansi_codepage_stack.pop();
|
|
1038
|
+
}
|
|
1031
1039
|
if hidden_stack.len() > 1 {
|
|
1032
1040
|
hidden_stack.pop();
|
|
1033
1041
|
}
|
|
@@ -1143,6 +1151,7 @@ pub fn extract_text_from_rtf(
|
|
|
1143
1151
|
&mut para_metas,
|
|
1144
1152
|
&mut para_meta_emitted,
|
|
1145
1153
|
&mut uc_stack,
|
|
1154
|
+
&mut ansi_codepage_stack,
|
|
1146
1155
|
&mut footnote_count,
|
|
1147
1156
|
in_footnote,
|
|
1148
1157
|
&mut footnote_buf,
|
|
@@ -1188,12 +1197,21 @@ pub fn extract_text_from_rtf(
|
|
|
1188
1197
|
expect_destination = false;
|
|
1189
1198
|
let hex1 = chars.next();
|
|
1190
1199
|
let hex2 = chars.next();
|
|
1191
|
-
|
|
1192
|
-
if in_footnote
|
|
1193
|
-
&& let (Some(h1), Some(h2)) = (hex1, hex2)
|
|
1200
|
+
let bytes = if let (Some(h1), Some(h2)) = (hex1, hex2)
|
|
1194
1201
|
&& let Some(byte) = parse_hex_byte(h1, h2)
|
|
1195
1202
|
{
|
|
1196
|
-
|
|
1203
|
+
let mut bytes = vec![byte];
|
|
1204
|
+
while let Some(next_bytes) = consume_adjacent_hex_escape(&mut chars) {
|
|
1205
|
+
bytes.push(next_bytes);
|
|
1206
|
+
}
|
|
1207
|
+
Some(bytes)
|
|
1208
|
+
} else {
|
|
1209
|
+
None
|
|
1210
|
+
};
|
|
1211
|
+
|
|
1212
|
+
if in_footnote && let Some(bytes) = bytes.as_deref() {
|
|
1213
|
+
let codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
|
|
1214
|
+
footnote_buf.push_str(&decode_ansi_bytes(bytes, codepage));
|
|
1197
1215
|
}
|
|
1198
1216
|
if skip_depth > 0 {
|
|
1199
1217
|
continue;
|
|
@@ -1202,14 +1220,13 @@ pub fn extract_text_from_rtf(
|
|
|
1202
1220
|
if hidden_stack.last().copied().unwrap_or(false) {
|
|
1203
1221
|
continue;
|
|
1204
1222
|
}
|
|
1205
|
-
if let
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
let decoded = decode_windows_1252(byte);
|
|
1223
|
+
if let Some(bytes) = bytes.as_deref() {
|
|
1224
|
+
let codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
|
|
1225
|
+
let decoded = decode_ansi_bytes(bytes, codepage);
|
|
1209
1226
|
if let Some(state) = table_state.as_mut()
|
|
1210
1227
|
&& state.in_row
|
|
1211
1228
|
{
|
|
1212
|
-
state.current_cell.
|
|
1229
|
+
state.current_cell.push_str(&decoded);
|
|
1213
1230
|
} else {
|
|
1214
1231
|
// Flush deferred boundary space
|
|
1215
1232
|
if pending_boundary_space
|
|
@@ -1221,7 +1238,7 @@ pub fn extract_text_from_rtf(
|
|
|
1221
1238
|
}
|
|
1222
1239
|
pending_boundary_space = false;
|
|
1223
1240
|
para_meta_emitted = false;
|
|
1224
|
-
result.
|
|
1241
|
+
result.push_str(&decoded);
|
|
1225
1242
|
if let Some(flag) = group_has_text.last_mut() {
|
|
1226
1243
|
*flag = true;
|
|
1227
1244
|
}
|
|
@@ -1331,6 +1348,13 @@ pub fn extract_text_from_rtf(
|
|
|
1331
1348
|
{
|
|
1332
1349
|
*uc = val.max(0) as u8;
|
|
1333
1350
|
}
|
|
1351
|
+
if control_word == "ansicpg"
|
|
1352
|
+
&& let Some(val) = _param
|
|
1353
|
+
&& val > 0
|
|
1354
|
+
&& let Some(codepage) = ansi_codepage_stack.last_mut()
|
|
1355
|
+
{
|
|
1356
|
+
*codepage = val as u32;
|
|
1357
|
+
}
|
|
1334
1358
|
// Capture unicode chars inside footnote buffers
|
|
1335
1359
|
if in_footnote
|
|
1336
1360
|
&& control_word == "u"
|
|
@@ -1382,6 +1406,7 @@ pub fn extract_text_from_rtf(
|
|
|
1382
1406
|
&mut para_metas,
|
|
1383
1407
|
&mut para_meta_emitted,
|
|
1384
1408
|
&mut uc_stack,
|
|
1409
|
+
&mut ansi_codepage_stack,
|
|
1385
1410
|
&mut footnote_count,
|
|
1386
1411
|
in_footnote,
|
|
1387
1412
|
&mut footnote_buf,
|
|
@@ -1528,6 +1553,23 @@ pub fn extract_text_from_rtf(
|
|
|
1528
1553
|
(final_result, tables, images, para_metas, formatting_data)
|
|
1529
1554
|
}
|
|
1530
1555
|
|
|
1556
|
+
fn consume_adjacent_hex_escape(chars: &mut std::iter::Peekable<std::str::Chars>) -> Option<u8> {
|
|
1557
|
+
let mut lookahead = chars.clone();
|
|
1558
|
+
if lookahead.next()? != '\\' || lookahead.next()? != '\'' {
|
|
1559
|
+
return None;
|
|
1560
|
+
}
|
|
1561
|
+
let h1 = lookahead.next()?;
|
|
1562
|
+
let h2 = lookahead.next()?;
|
|
1563
|
+
let byte = parse_hex_byte(h1, h2)?;
|
|
1564
|
+
|
|
1565
|
+
chars.next();
|
|
1566
|
+
chars.next();
|
|
1567
|
+
chars.next();
|
|
1568
|
+
chars.next();
|
|
1569
|
+
|
|
1570
|
+
Some(byte)
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1531
1573
|
/// Handle an RTF control word during parsing.
|
|
1532
1574
|
#[allow(clippy::too_many_arguments, clippy::ptr_arg)]
|
|
1533
1575
|
fn handle_control_word(
|
|
@@ -1549,6 +1591,7 @@ fn handle_control_word(
|
|
|
1549
1591
|
para_metas: &mut Vec<ParagraphMeta>,
|
|
1550
1592
|
para_meta_emitted: &mut bool,
|
|
1551
1593
|
uc_stack: &mut Vec<u8>,
|
|
1594
|
+
ansi_codepage_stack: &mut [u32],
|
|
1552
1595
|
footnote_count: &mut usize,
|
|
1553
1596
|
_in_footnote: bool,
|
|
1554
1597
|
_footnote_buf: &mut String,
|
|
@@ -1617,6 +1660,14 @@ fn handle_control_word(
|
|
|
1617
1660
|
*uc = val.max(0) as u8;
|
|
1618
1661
|
}
|
|
1619
1662
|
}
|
|
1663
|
+
"ansicpg" => {
|
|
1664
|
+
if let Some(val) = param
|
|
1665
|
+
&& val > 0
|
|
1666
|
+
&& let Some(codepage) = ansi_codepage_stack.last_mut()
|
|
1667
|
+
{
|
|
1668
|
+
*codepage = val as u32;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1620
1671
|
// Unicode escape: \u1234 (signed integer)
|
|
1621
1672
|
"u" => {
|
|
1622
1673
|
if let Some(code_num) = param {
|
|
@@ -92,23 +92,28 @@ pub async fn extract_structured(
|
|
|
92
92
|
let sanitized_schema = sanitize_schema_for_provider(&config.schema, &config.llm.model);
|
|
93
93
|
|
|
94
94
|
// Build chat request with JSON schema response format.
|
|
95
|
-
// Use field assignment because `stream` is pub(crate) in liter-llm
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
request
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
95
|
+
// Use field assignment because `stream` is pub(crate) in liter-llm; struct-init
|
|
96
|
+
// syntax with `..Default::default()` won't compile across the crate boundary.
|
|
97
|
+
#[allow(clippy::field_reassign_with_default)]
|
|
98
|
+
let request = {
|
|
99
|
+
let mut req = liter_llm::ChatCompletionRequest::default();
|
|
100
|
+
req.model = config.llm.model.clone();
|
|
101
|
+
req.messages = vec![liter_llm::Message::User(liter_llm::UserMessage {
|
|
102
|
+
content: liter_llm::UserContent::Text(prompt),
|
|
103
|
+
name: None,
|
|
104
|
+
})];
|
|
105
|
+
req.temperature = config.llm.temperature;
|
|
106
|
+
req.max_tokens = config.llm.max_tokens;
|
|
107
|
+
req.response_format = Some(liter_llm::ResponseFormat::JsonSchema {
|
|
108
|
+
json_schema: liter_llm::JsonSchemaFormat {
|
|
109
|
+
name: config.schema_name.clone(),
|
|
110
|
+
description: config.schema_description.clone(),
|
|
111
|
+
schema: sanitized_schema,
|
|
112
|
+
strict: Some(config.strict),
|
|
113
|
+
},
|
|
114
|
+
});
|
|
115
|
+
req
|
|
116
|
+
};
|
|
112
117
|
|
|
113
118
|
let response = client
|
|
114
119
|
.chat(request)
|
|
@@ -136,12 +136,17 @@ pub async fn vlm_ocr(
|
|
|
136
136
|
name: None,
|
|
137
137
|
});
|
|
138
138
|
|
|
139
|
-
// Use mutable default because `stream` is pub(crate) in liter-llm
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
request
|
|
143
|
-
|
|
144
|
-
|
|
139
|
+
// Use mutable default because `stream` is pub(crate) in liter-llm; struct-init
|
|
140
|
+
// syntax with `..Default::default()` won't compile across the crate boundary.
|
|
141
|
+
#[allow(clippy::field_reassign_with_default)]
|
|
142
|
+
let request = {
|
|
143
|
+
let mut req = ChatCompletionRequest::default();
|
|
144
|
+
req.model = config.model.clone();
|
|
145
|
+
req.messages = vec![message];
|
|
146
|
+
req.temperature = config.temperature;
|
|
147
|
+
req.max_tokens = config.max_tokens;
|
|
148
|
+
req
|
|
149
|
+
};
|
|
145
150
|
|
|
146
151
|
let response = client.chat(request).await.map_err(|e| {
|
|
147
152
|
crate::KreuzbergError::ocr(format!(
|
|
@@ -36,6 +36,7 @@ pub(crate) fn current_pdf_path() -> Option<PathBuf> {
|
|
|
36
36
|
/// Returns segments per page (indexed by page number, 0-based).
|
|
37
37
|
/// Returns `None` if pdf_oxide fails to open or extract the document.
|
|
38
38
|
#[cfg(feature = "pdf")]
|
|
39
|
+
#[allow(unused_mut)] // pdf is mutated under feature-gated paths only
|
|
39
40
|
pub(crate) fn extract_segments_with_oxide(page_count: usize) -> Option<Vec<Vec<SegmentData>>> {
|
|
40
41
|
let file_path = match current_pdf_path() {
|
|
41
42
|
Some(p) => {
|
|
@@ -98,6 +98,33 @@ async fn test_rtf_accent_extraction() {
|
|
|
98
98
|
);
|
|
99
99
|
}
|
|
100
100
|
|
|
101
|
+
/// Test extraction of RTF file with CP1251 hex byte escapes.
|
|
102
|
+
///
|
|
103
|
+
/// File: ansicpg1251.rtf
|
|
104
|
+
/// Content: Cyrillic text encoded as `\'hh` bytes with `\ansicpg1251`
|
|
105
|
+
/// Expected: Decodes byte escapes with the declared ANSI codepage
|
|
106
|
+
#[tokio::test]
|
|
107
|
+
async fn test_rtf_ansicpg1251_extraction() {
|
|
108
|
+
let config = ExtractionConfig::default();
|
|
109
|
+
let path = get_rtf_path("ansicpg1251.rtf");
|
|
110
|
+
|
|
111
|
+
let result = extract_file(&path, Some("application/rtf"), &config).await;
|
|
112
|
+
|
|
113
|
+
assert!(result.is_ok(), "RTF extraction should succeed for ansicpg1251.rtf");
|
|
114
|
+
let extraction = result.expect("Operation failed");
|
|
115
|
+
|
|
116
|
+
assert_eq!(extraction.mime_type, "application/rtf");
|
|
117
|
+
assert!(
|
|
118
|
+
extraction.content.contains("Привет, мир!"),
|
|
119
|
+
"Should decode CP1251 hex escapes as Cyrillic text (found: {})",
|
|
120
|
+
extraction.content
|
|
121
|
+
);
|
|
122
|
+
assert!(
|
|
123
|
+
!extraction.content.contains("Ïðèâåò"),
|
|
124
|
+
"Should not decode CP1251 bytes as Windows-1252 mojibake"
|
|
125
|
+
);
|
|
126
|
+
}
|
|
127
|
+
|
|
101
128
|
/// Test extraction of RTF file with bookmarks (internal anchors/references).
|
|
102
129
|
///
|
|
103
130
|
/// File: bookmark.rtf
|
|
@@ -531,6 +558,7 @@ async fn test_rtf_no_critical_content_loss() {
|
|
|
531
558
|
|
|
532
559
|
let must_extract = vec![
|
|
533
560
|
"unicode.rtf",
|
|
561
|
+
"ansicpg1251.rtf",
|
|
534
562
|
"accent.rtf",
|
|
535
563
|
"heading.rtf",
|
|
536
564
|
"list_simple.rtf",
|
|
@@ -574,7 +602,13 @@ async fn test_rtf_no_critical_content_loss() {
|
|
|
574
602
|
async fn test_rtf_mime_type_preservation() {
|
|
575
603
|
let config = ExtractionConfig::default();
|
|
576
604
|
|
|
577
|
-
let test_files = vec![
|
|
605
|
+
let test_files = vec![
|
|
606
|
+
"unicode.rtf",
|
|
607
|
+
"ansicpg1251.rtf",
|
|
608
|
+
"accent.rtf",
|
|
609
|
+
"heading.rtf",
|
|
610
|
+
"list_simple.rtf",
|
|
611
|
+
];
|
|
578
612
|
|
|
579
613
|
for filename in test_files {
|
|
580
614
|
let path = get_rtf_path(filename);
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-ffi"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -28,14 +28,14 @@ tree-sitter = ["kreuzberg/tree-sitter"]
|
|
|
28
28
|
[dependencies]
|
|
29
29
|
ahash = { version = "0.8.12", features = ["serde"] }
|
|
30
30
|
async-trait = "0.1.89"
|
|
31
|
-
ctor = "0
|
|
32
|
-
html-to-markdown-rs = { version = "3.
|
|
33
|
-
kreuzberg = { path = "../kreuzberg", version = "4.9.
|
|
31
|
+
ctor = "1.0"
|
|
32
|
+
html-to-markdown-rs = { version = "3.4.1", default-features = false }
|
|
33
|
+
kreuzberg = { path = "../kreuzberg", version = "4.9.8", default-features = false, features = ["bundled-pdfium", "full"] }
|
|
34
34
|
log = "0.4"
|
|
35
35
|
rayon = { version = "1.12.0", optional = true }
|
|
36
36
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
37
37
|
serde_json = { version = "1.0.149" }
|
|
38
|
-
tokio = { version = "1.52.
|
|
38
|
+
tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
39
39
|
|
|
40
40
|
[build-dependencies]
|
|
41
41
|
cbindgen = "0.29"
|
|
@@ -108,7 +108,7 @@ pub use types::*;
|
|
|
108
108
|
pub use util::{kreuzberg_last_error, kreuzberg_last_error_code, kreuzberg_last_panic_context, kreuzberg_version};
|
|
109
109
|
pub use validation::*;
|
|
110
110
|
|
|
111
|
-
#[ctor::ctor]
|
|
111
|
+
#[ctor::ctor(unsafe)]
|
|
112
112
|
fn setup_onnx_runtime_path() {
|
|
113
113
|
kreuzberg::ort_discovery::ensure_ort_available();
|
|
114
114
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.9.7"
|
|
3
|
+
version = "4.9.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -38,21 +38,21 @@ dynamic-linking = []
|
|
|
38
38
|
thiserror = "2.0.18"
|
|
39
39
|
|
|
40
40
|
[build-dependencies]
|
|
41
|
-
cc = { version = "^1.2.
|
|
41
|
+
cc = { version = "^1.2.62", optional = true }
|
|
42
42
|
cmake = { version = "0.1.58", optional = true }
|
|
43
43
|
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
|
44
44
|
"deflate-flate2-zlib-rs",
|
|
45
45
|
] }
|
|
46
46
|
|
|
47
47
|
[target.'cfg(not(target_os = "windows"))'.build-dependencies]
|
|
48
|
-
reqwest = { version = "0.13.
|
|
48
|
+
reqwest = { version = "0.13.3", default-features = false, features = [
|
|
49
49
|
"blocking",
|
|
50
50
|
"rustls",
|
|
51
51
|
], optional = true }
|
|
52
52
|
|
|
53
53
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
54
54
|
[target.'cfg(target_os = "windows")'.build-dependencies]
|
|
55
|
-
reqwest = { version = "0.13.
|
|
55
|
+
reqwest = { version = "0.13.3", default-features = false, features = [
|
|
56
56
|
"blocking",
|
|
57
57
|
"native-tls",
|
|
58
58
|
], optional = true }
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.9.7
|
|
4
|
+
version: 4.9.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|