kreuzberg 4.9.8 → 4.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +7 -8
- data/vendor/kreuzberg/Cargo.toml +22 -21
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/pdf.rs +2 -5
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +6 -1
- data/vendor/kreuzberg/src/core/extractor/file.rs +6 -1
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +115 -15
- data/vendor/kreuzberg/src/embeddings/mod.rs +17 -13
- data/vendor/kreuzberg/src/extraction/email.rs +58 -7
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +72 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +0 -168
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +1 -410
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +41 -15
- data/vendor/kreuzberg/src/pdf/images.rs +22 -4
- data/vendor/kreuzberg/src/pdf/mod.rs +0 -16
- data/vendor/kreuzberg/src/pdf/rendering.rs +53 -6
- data/vendor/kreuzberg/src/pdf/structure/mod.rs +0 -2
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +12 -890
- data/vendor/kreuzberg/src/table_core.rs +8 -1
- data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +26 -0
- data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +1 -2
- data/vendor/kreuzberg-ffi/Cargo.toml +5 -5
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/config/loader.rs +39 -24
- data/vendor/kreuzberg-ffi/src/config/mod.rs +0 -4
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
- data/vendor/kreuzberg-paddle-ocr/src/ocr_utils.rs +3 -3
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
- metadata +2 -10
- data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +0 -258
- data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +0 -235
- data/vendor/kreuzberg/src/pdf/oxide/images.rs +0 -53
- data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +0 -381
- data/vendor/kreuzberg/src/pdf/oxide/mod.rs +0 -43
- data/vendor/kreuzberg/src/pdf/oxide/table.rs +0 -247
- data/vendor/kreuzberg/src/pdf/oxide/text.rs +0 -250
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +0 -122
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de92334e109bbca1bdd22469a651f146bf29eee730dfc841c3dcb4703ee3ba5b
|
|
4
|
+
data.tar.gz: 53140e24511ff0910814325859b3e7382ee9f511e2412181812607f0f3516f33
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 80b7a6fa716b1adf28d543074581d5f88984aae738669e52116c49d301b1116ed7311c4bcfa90d1853a9d98752b0d2ad13b5c0e14e9f2623217ff319c007eca2
|
|
7
|
+
data.tar.gz: fab95d4048b382cf3ff4e154d7979bc4963e35e57c802cd5f871950d460ef16c5745c66931fbbf15b02a479f0a991a9e1832a126d445c002c7bf79b92ae143b9
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.8" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.9" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.8"
|
|
3
|
+
version = "4.9.9"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -52,7 +52,7 @@ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb51742
|
|
|
52
52
|
rb-sys = { version = "0.9.128", default-features = false, features = [
|
|
53
53
|
"stable-api-compiled-fallback",
|
|
54
54
|
] }
|
|
55
|
-
serde_json = "1.0.
|
|
55
|
+
serde_json = "1.0.150"
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
serde_yaml_ng = "0.10"
|
|
58
58
|
tokio = { version = "1.52.3", features = [
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.3", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.
|
|
68
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.8"
|
|
5
|
+
version = "4.9.9"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -28,29 +28,28 @@ dbase = "0.7"
|
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.
|
|
31
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.8" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.9", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.9" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.186"
|
|
39
39
|
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
|
|
40
40
|
log = "0.4"
|
|
41
|
-
lzma-rust2 = { version = "0.16.
|
|
41
|
+
lzma-rust2 = { version = "0.16.4" }
|
|
42
42
|
memmap2 = "0.9"
|
|
43
43
|
minijinja = "2"
|
|
44
44
|
num_cpus = "1.17.0"
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.49", default-features = false }
|
|
49
48
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
49
|
rayon = "1.12.0"
|
|
51
|
-
reqwest = { version = "0.13.
|
|
50
|
+
reqwest = { version = "0.13.4", default-features = false }
|
|
52
51
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
53
|
-
serde_json = { version = "1.0.
|
|
52
|
+
serde_json = { version = "1.0.150" }
|
|
54
53
|
serde_toon_format = "0.1"
|
|
55
54
|
tempfile = "3.27.0"
|
|
56
55
|
thiserror = "2.0.18"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.8"
|
|
3
|
+
version = "4.9.9"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -39,10 +39,6 @@ simd-utf8 = ["dep:simdutf8"]
|
|
|
39
39
|
tokio-runtime = ["dep:tokio"]
|
|
40
40
|
|
|
41
41
|
pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image", "dep:flate2", "html"]
|
|
42
|
-
# Experimental: use pdf_oxide for text extraction (pure Rust, no C++ deps).
|
|
43
|
-
# Provides cleaner word spacing for PDFs with broken font CMaps.
|
|
44
|
-
# Requires 'pdf' feature. Not included in 'full' — opt-in only.
|
|
45
|
-
pdf-oxide = ["pdf", "dep:pdf_oxide"]
|
|
46
42
|
static-pdfium = ["pdf"]
|
|
47
43
|
bundled-pdfium = ["pdf"]
|
|
48
44
|
system-pdfium = ["pdf"]
|
|
@@ -61,7 +57,14 @@ office = [
|
|
|
61
57
|
]
|
|
62
58
|
hwp = ["dep:cfb", "dep:flate2"]
|
|
63
59
|
iwork = ["dep:zip", "dep:snap"]
|
|
64
|
-
email = [
|
|
60
|
+
email = [
|
|
61
|
+
"dep:mail-parser",
|
|
62
|
+
"dep:cfb",
|
|
63
|
+
"dep:outlook-pst",
|
|
64
|
+
"dep:tempfile",
|
|
65
|
+
"dep:chrono",
|
|
66
|
+
"dep:chardetng",
|
|
67
|
+
]
|
|
65
68
|
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
|
|
66
69
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
67
70
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
|
|
@@ -259,7 +262,7 @@ cfb = { version = "0.14", optional = true }
|
|
|
259
262
|
chardetng = { version = "1.0.0", optional = true }
|
|
260
263
|
chrono = { version = "0.4", optional = true }
|
|
261
264
|
comrak = { version = "0.52", default-features = false }
|
|
262
|
-
dashmap = "6.
|
|
265
|
+
dashmap = "6.2"
|
|
263
266
|
dbase = { version = "0.7", optional = true }
|
|
264
267
|
dirs = "6"
|
|
265
268
|
encoding_rs = { version = "0.8.35" }
|
|
@@ -271,7 +274,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
274
|
"simd",
|
|
272
275
|
], optional = true }
|
|
273
276
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.
|
|
277
|
+
html-to-markdown-rs = { version = "3.5.7", default-features = false, features = [
|
|
275
278
|
"inline-images",
|
|
276
279
|
"metadata",
|
|
277
280
|
], optional = true }
|
|
@@ -294,9 +297,9 @@ kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
|
294
297
|
libc = "0.2.186"
|
|
295
298
|
liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
|
|
296
299
|
log = "0.4"
|
|
297
|
-
lopdf = { version = "0.
|
|
300
|
+
lopdf = { version = "0.41.0", optional = true }
|
|
298
301
|
mail-parser = { version = "0.11.3", optional = true }
|
|
299
|
-
memchr = "2.8.
|
|
302
|
+
memchr = "2.8.1"
|
|
300
303
|
memmap2 = "0.9"
|
|
301
304
|
mime_guess = "2.0"
|
|
302
305
|
minijinja = { version = "2", optional = true }
|
|
@@ -314,7 +317,6 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
317
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
318
|
parking_lot = "0.12.5"
|
|
316
319
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.49", default-features = false, optional = true }
|
|
318
320
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
321
|
pulldown-cmark = { version = "0.13" }
|
|
320
322
|
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
|
|
@@ -333,22 +335,21 @@ rmp-serde = "1.3"
|
|
|
333
335
|
|
|
334
336
|
roxmltree = { version = "0.21.1", optional = true }
|
|
335
337
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
336
|
-
serde_json = { version = "1.0.
|
|
338
|
+
serde_json = { version = "1.0.150" }
|
|
337
339
|
serde_toon_format = "0.1"
|
|
338
340
|
serde_yaml_ng = "0.10.0"
|
|
339
341
|
sevenz-rust2 = { version = "0.20.2", optional = true }
|
|
340
342
|
sha2 = { version = "0.11", optional = true }
|
|
341
343
|
simdutf8 = { version = "0.1", optional = true }
|
|
342
344
|
snap = { version = "1.1", optional = true }
|
|
343
|
-
tar = { version = "0.4.
|
|
345
|
+
tar = { version = "0.4.46", optional = true }
|
|
344
346
|
tempfile = { version = "3.27.0", optional = true }
|
|
345
|
-
text-splitter = { version = "0.
|
|
347
|
+
text-splitter = { version = "0.31.0", features = ["markdown"], optional = true }
|
|
346
348
|
thiserror = "2.0.18"
|
|
347
349
|
tiff = { version = "0.11", optional = true }
|
|
348
|
-
#
|
|
349
|
-
#
|
|
350
|
-
|
|
351
|
-
tokenizers = { version = "=0.22.2", optional = true, default-features = false, features = [
|
|
350
|
+
# Keep aligned with text-splitter's optional tokenizers integration so ChunkSizer
|
|
351
|
+
# is implemented for the same Tokenizer type used by Kreuzberg.
|
|
352
|
+
tokenizers = { version = "0.23.1", optional = true, default-features = false, features = [
|
|
352
353
|
"http",
|
|
353
354
|
"fancy-regex",
|
|
354
355
|
] }
|
|
@@ -357,7 +358,7 @@ toml = "1.1.2"
|
|
|
357
358
|
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
358
359
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
359
360
|
tracing = "0.1"
|
|
360
|
-
tracing-opentelemetry = { version = "0.
|
|
361
|
+
tracing-opentelemetry = { version = "0.33", optional = true }
|
|
361
362
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
362
363
|
urlencoding = "2"
|
|
363
364
|
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
|
|
@@ -411,8 +412,8 @@ dotenvy = "0.15"
|
|
|
411
412
|
filetime = "0.2"
|
|
412
413
|
image = { version = "0.25.10", default-features = false, features = ["png"] }
|
|
413
414
|
jsonschema = "0.46"
|
|
414
|
-
serial_test = "3.
|
|
415
|
-
tar = "0.4.
|
|
415
|
+
serial_test = "3.5.0"
|
|
416
|
+
tar = "0.4.46"
|
|
416
417
|
tempfile = "3.27.0"
|
|
417
418
|
tokio = { version = "1.52.3", features = ["macros", "time"] }
|
|
418
419
|
tokio-test = "0.4"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.8 Release**
|
|
21
|
+
> **🚀 Version 4.9.9 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -8,17 +8,14 @@ use serde::{Deserialize, Serialize};
|
|
|
8
8
|
/// PDF extraction backend selection.
|
|
9
9
|
///
|
|
10
10
|
/// Controls which PDF library is used for text extraction:
|
|
11
|
-
/// - `Pdfium`: pdfium-render (default,
|
|
12
|
-
/// - `
|
|
13
|
-
/// - `Auto`: automatically select based on available features
|
|
11
|
+
/// - `Pdfium`: pdfium-render (default, mature)
|
|
12
|
+
/// - `Auto`: automatically select the default available backend
|
|
14
13
|
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
15
14
|
#[serde(rename_all = "lowercase")]
|
|
16
15
|
pub enum PdfBackend {
|
|
17
16
|
/// Use pdfium-render backend (default).
|
|
18
17
|
#[default]
|
|
19
18
|
Pdfium,
|
|
20
|
-
/// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
|
|
21
|
-
PdfOxide,
|
|
22
19
|
/// Automatically select the best available backend.
|
|
23
20
|
Auto,
|
|
24
21
|
}
|
|
@@ -128,7 +128,12 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
128
128
|
|
|
129
129
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
130
130
|
let result = {
|
|
131
|
-
|
|
131
|
+
if config.extraction_timeout_secs.is_some() {
|
|
132
|
+
return Err(crate::KreuzbergError::Validation {
|
|
133
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
134
|
+
source: None,
|
|
135
|
+
});
|
|
136
|
+
}
|
|
132
137
|
extraction_future.await
|
|
133
138
|
};
|
|
134
139
|
|
|
@@ -142,7 +142,12 @@ pub async fn extract_file(
|
|
|
142
142
|
|
|
143
143
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
144
144
|
let result = {
|
|
145
|
-
|
|
145
|
+
if config.extraction_timeout_secs.is_some() {
|
|
146
|
+
return Err(crate::KreuzbergError::Validation {
|
|
147
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
148
|
+
source: None,
|
|
149
|
+
});
|
|
150
|
+
}
|
|
146
151
|
extraction_future.await
|
|
147
152
|
};
|
|
148
153
|
|
|
@@ -35,6 +35,13 @@ pub(super) fn extract_bytes_sync_impl(
|
|
|
35
35
|
let cfg = config.cloned().unwrap_or_default();
|
|
36
36
|
let cfg = cfg.normalized().into_owned();
|
|
37
37
|
|
|
38
|
+
if cfg.extraction_timeout_secs.is_some() {
|
|
39
|
+
return Err(crate::KreuzbergError::Validation {
|
|
40
|
+
message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
|
|
41
|
+
source: None,
|
|
42
|
+
});
|
|
43
|
+
}
|
|
44
|
+
|
|
38
45
|
let validated_mime = if let Some(mime) = mime_type {
|
|
39
46
|
if mime == "application/octet-stream" {
|
|
40
47
|
mime::detect_mime_type_from_bytes(content)?
|
|
@@ -37,10 +37,19 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
37
37
|
continue;
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
|
|
40
|
+
let normalized: String = page
|
|
41
|
+
.content
|
|
42
|
+
.split("\n\n")
|
|
43
|
+
.map(str::trim)
|
|
44
|
+
.filter(|s| !s.is_empty())
|
|
45
|
+
.collect::<Vec<_>>()
|
|
46
|
+
.join("\n\n");
|
|
47
|
+
|
|
48
|
+
// Try normalized exact match first. PDF page text can contain trailing
|
|
49
|
+
// spaces that render_plain strips before chunking.
|
|
50
|
+
if let Some(pos) = content[search_offset..].find(normalized.as_str()) {
|
|
42
51
|
let byte_start = search_offset + pos;
|
|
43
|
-
let byte_end = content.floor_char_boundary(byte_start +
|
|
52
|
+
let byte_end = content.floor_char_boundary(byte_start + normalized.len());
|
|
44
53
|
boundaries.push(PageBoundary {
|
|
45
54
|
page_number: page.page_number,
|
|
46
55
|
byte_start,
|
|
@@ -50,12 +59,12 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
50
59
|
continue;
|
|
51
60
|
}
|
|
52
61
|
|
|
53
|
-
// Fallback: search for first non-empty line of page content
|
|
62
|
+
// Fallback: search for first non-empty line of page content.
|
|
54
63
|
if let Some(line) = page.content.lines().find(|l| !l.trim().is_empty()).map(|l| l.trim())
|
|
55
64
|
&& let Some(pos) = content[search_offset..].find(line)
|
|
56
65
|
{
|
|
57
66
|
let byte_start = search_offset + pos;
|
|
58
|
-
let raw_end = (byte_start +
|
|
67
|
+
let raw_end = (byte_start + normalized.len()).min(content.len());
|
|
59
68
|
let byte_end = content.floor_char_boundary(raw_end);
|
|
60
69
|
boundaries.push(PageBoundary {
|
|
61
70
|
page_number: page.page_number,
|
|
@@ -176,25 +185,27 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
|
|
|
176
185
|
let resolved_config = chunking_config.resolve_preset();
|
|
177
186
|
let chunking_config = &resolved_config;
|
|
178
187
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
188
|
+
let (chunk_input, heading_source) = if config.output_format != crate::core::config::OutputFormat::Plain {
|
|
189
|
+
(
|
|
190
|
+
result.formatted_content.as_deref().unwrap_or(result.content.as_str()),
|
|
191
|
+
None,
|
|
192
|
+
)
|
|
193
|
+
} else {
|
|
194
|
+
(result.content.as_str(), result.formatted_content.as_deref())
|
|
195
|
+
};
|
|
196
|
+
|
|
183
197
|
let recomputed_boundaries: Option<Vec<PageBoundary>> = result
|
|
184
198
|
.pages
|
|
185
199
|
.as_deref()
|
|
186
|
-
.map(|pages| recompute_boundaries_from_pages(
|
|
200
|
+
.map(|pages| recompute_boundaries_from_pages(chunk_input, pages))
|
|
201
|
+
.filter(|boundaries| !boundaries.is_empty());
|
|
187
202
|
|
|
188
203
|
let page_boundaries: Option<&[PageBoundary]> = recomputed_boundaries
|
|
189
204
|
.as_deref()
|
|
190
205
|
.or_else(|| result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref()));
|
|
191
206
|
|
|
192
|
-
// Pass formatted_content (markdown) for heading context resolution when available.
|
|
193
|
-
// Plain-text rendering strips heading markers, but the markdown chunker needs them
|
|
194
|
-
// to build the heading hierarchy for chunk metadata.
|
|
195
|
-
let heading_source = result.formatted_content.as_deref();
|
|
196
207
|
match crate::chunking::chunk_text_with_heading_source(
|
|
197
|
-
|
|
208
|
+
chunk_input,
|
|
198
209
|
chunking_config,
|
|
199
210
|
page_boundaries,
|
|
200
211
|
heading_source,
|
|
@@ -314,3 +325,92 @@ pub(super) fn execute_token_reduction(result: &mut ExtractionResult, config: &Ex
|
|
|
314
325
|
|
|
315
326
|
Ok(())
|
|
316
327
|
}
|
|
328
|
+
|
|
329
|
+
#[cfg(test)]
|
|
330
|
+
#[cfg(feature = "chunking")]
|
|
331
|
+
mod tests {
|
|
332
|
+
use super::*;
|
|
333
|
+
use crate::core::config::{ChunkerType, ChunkingConfig, OutputFormat};
|
|
334
|
+
use crate::types::PageContent;
|
|
335
|
+
|
|
336
|
+
fn make_page(page_number: usize, content: &str) -> PageContent {
|
|
337
|
+
PageContent {
|
|
338
|
+
page_number,
|
|
339
|
+
content: content.to_string(),
|
|
340
|
+
tables: Vec::new(),
|
|
341
|
+
images: Vec::new(),
|
|
342
|
+
hierarchy: None,
|
|
343
|
+
is_blank: None,
|
|
344
|
+
layout_regions: None,
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
fn markdown_chunking_config() -> ExtractionConfig {
|
|
349
|
+
ExtractionConfig {
|
|
350
|
+
output_format: OutputFormat::Markdown,
|
|
351
|
+
chunking: Some(ChunkingConfig {
|
|
352
|
+
max_characters: 2000,
|
|
353
|
+
overlap: 0,
|
|
354
|
+
trim: true,
|
|
355
|
+
chunker_type: ChunkerType::Markdown,
|
|
356
|
+
..Default::default()
|
|
357
|
+
}),
|
|
358
|
+
..Default::default()
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
#[test]
|
|
363
|
+
fn chunks_content_is_markdown_when_output_format_is_markdown() {
|
|
364
|
+
let mut result = ExtractionResult {
|
|
365
|
+
content: "SH-001 Luca Bianchi Common Germany 3500000".to_string(),
|
|
366
|
+
formatted_content: Some("| SH-001 | Luca Bianchi | Common | Germany | 3,500,000 |".to_string()),
|
|
367
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
368
|
+
..Default::default()
|
|
369
|
+
};
|
|
370
|
+
|
|
371
|
+
execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
|
|
372
|
+
|
|
373
|
+
let chunks = result.chunks.expect("chunks must be populated");
|
|
374
|
+
assert!(!chunks.is_empty());
|
|
375
|
+
assert!(chunks.iter().any(|chunk| chunk.content.contains('|')));
|
|
376
|
+
assert!(chunks.iter().all(|chunk| !chunk.content.starts_with("SH-001 Luca")));
|
|
377
|
+
assert!(result.formatted_content.is_some());
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
#[test]
|
|
381
|
+
fn markdown_chunks_preserve_page_metadata_when_formatted_pages_match() {
|
|
382
|
+
let mut result = ExtractionResult {
|
|
383
|
+
content: "Page one text\n\nPage two text".to_string(),
|
|
384
|
+
formatted_content: Some("# Page one\n\nPage one text\n\n# Page two\n\nPage two text".to_string()),
|
|
385
|
+
pages: Some(vec![make_page(1, "Page one text"), make_page(2, "Page two text")]),
|
|
386
|
+
mime_type: Cow::Borrowed("application/pdf"),
|
|
387
|
+
..Default::default()
|
|
388
|
+
};
|
|
389
|
+
|
|
390
|
+
execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
|
|
391
|
+
|
|
392
|
+
let chunks = result.chunks.expect("chunks must be populated");
|
|
393
|
+
assert!(!chunks.is_empty());
|
|
394
|
+
assert!(chunks.iter().any(|chunk| chunk.metadata.first_page.is_some()));
|
|
395
|
+
assert!(chunks.iter().any(|chunk| chunk.metadata.last_page.is_some()));
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
#[test]
|
|
399
|
+
fn recompute_boundaries_trailing_space_pages_all_resolve() {
|
|
400
|
+
let p1_raw = "Heading \n\nBody paragraph one. ";
|
|
401
|
+
let p2_raw = "Second heading \n\nBody paragraph two. ";
|
|
402
|
+
let p3_raw = "Conclusion. ";
|
|
403
|
+
let p1_norm = "Heading\n\nBody paragraph one.";
|
|
404
|
+
let p2_norm = "Second heading\n\nBody paragraph two.";
|
|
405
|
+
let p3_norm = "Conclusion.";
|
|
406
|
+
let content = format!("{p1_norm}\n\n{p2_norm}\n\n{p3_norm}");
|
|
407
|
+
|
|
408
|
+
let pages = vec![make_page(1, p1_raw), make_page(2, p2_raw), make_page(3, p3_raw)];
|
|
409
|
+
let boundaries = recompute_boundaries_from_pages(&content, &pages);
|
|
410
|
+
|
|
411
|
+
assert_eq!(boundaries.len(), 3);
|
|
412
|
+
assert_eq!(&content[boundaries[0].byte_start..boundaries[0].byte_end], p1_norm);
|
|
413
|
+
assert_eq!(&content[boundaries[1].byte_start..boundaries[1].byte_end], p2_norm);
|
|
414
|
+
assert_eq!(&content[boundaries[2].byte_start..boundaries[2].byte_end], p3_norm);
|
|
415
|
+
}
|
|
416
|
+
}
|
|
@@ -270,11 +270,13 @@ fn load_tokenizer(
|
|
|
270
270
|
{
|
|
271
271
|
for (_, value) in &map {
|
|
272
272
|
if let Some(content) = value.as_str() {
|
|
273
|
-
tokenizer
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
273
|
+
tokenizer
|
|
274
|
+
.add_special_tokens([AddedToken {
|
|
275
|
+
content: content.to_string(),
|
|
276
|
+
special: true,
|
|
277
|
+
..Default::default()
|
|
278
|
+
}])
|
|
279
|
+
.map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
|
|
278
280
|
} else if value.is_object()
|
|
279
281
|
&& let (Some(content), Some(single_word), Some(lstrip), Some(rstrip), Some(normalized)) = (
|
|
280
282
|
value["content"].as_str(),
|
|
@@ -284,14 +286,16 @@ fn load_tokenizer(
|
|
|
284
286
|
value["normalized"].as_bool(),
|
|
285
287
|
)
|
|
286
288
|
{
|
|
287
|
-
tokenizer
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
289
|
+
tokenizer
|
|
290
|
+
.add_special_tokens([AddedToken {
|
|
291
|
+
content: content.to_string(),
|
|
292
|
+
special: true,
|
|
293
|
+
single_word,
|
|
294
|
+
lstrip,
|
|
295
|
+
rstrip,
|
|
296
|
+
normalized,
|
|
297
|
+
}])
|
|
298
|
+
.map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
|
|
295
299
|
}
|
|
296
300
|
}
|
|
297
301
|
}
|
|
@@ -74,12 +74,23 @@ fn maybe_transcode_utf16(data: &[u8]) -> Option<Vec<u8>> {
|
|
|
74
74
|
(true, 2)
|
|
75
75
|
} else if data[0] == 0xFE && data[1] == 0xFF {
|
|
76
76
|
(false, 2)
|
|
77
|
-
} else if data
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
77
|
+
} else if data.len() >= 16 {
|
|
78
|
+
let is_le_heuristic = data[1] == 0x00 && data[3] == 0x00 && data[5] == 0x00 && data[7] == 0x00;
|
|
79
|
+
let is_be_heuristic = data[0] == 0x00 && data[2] == 0x00 && data[4] == 0x00 && data[6] == 0x00;
|
|
80
|
+
|
|
81
|
+
if is_le_heuristic || is_be_heuristic {
|
|
82
|
+
let mut detector = chardetng::EncodingDetector::new(chardetng::Iso2022JpDetection::Allow);
|
|
83
|
+
detector.feed(data, true);
|
|
84
|
+
let guess = detector.guess(None, chardetng::Utf8Detection::Allow);
|
|
85
|
+
|
|
86
|
+
if guess.name() == "UTF-8" || guess.name() == "windows-1252" {
|
|
87
|
+
(is_le_heuristic, 0)
|
|
88
|
+
} else {
|
|
89
|
+
return None;
|
|
90
|
+
}
|
|
91
|
+
} else {
|
|
92
|
+
return None;
|
|
93
|
+
}
|
|
83
94
|
} else {
|
|
84
95
|
return None;
|
|
85
96
|
};
|
|
@@ -553,6 +564,8 @@ Courier{\\colortbl\\red0\\green0\\blue0\r\n\\par \\pard\\plain\\f0\\fs20\\b\\i\\
|
|
|
553
564
|
\\scaps\\outline\\shadow\\imprint\\emboss\\lang1024\\sbasedon1033\\fcharset0 {\\*\\cs10 \\additive \
|
|
554
565
|
Default Paragraph Font}";
|
|
555
566
|
|
|
567
|
+
const MAX_RTF_DECOMPRESSED_CAPACITY: usize = 16 * 1024 * 1024;
|
|
568
|
+
|
|
556
569
|
/// Decompress a PR_RTF_COMPRESSED stream per the MS-OXRTFCP specification.
|
|
557
570
|
///
|
|
558
571
|
/// Returns `None` when the data is too short, has a bad magic number, or
|
|
@@ -585,7 +598,7 @@ fn decompress_rtf_compressed(data: &[u8]) -> Option<Vec<u8>> {
|
|
|
585
598
|
// comp_size includes the 12 bytes after the first u32, so input length should be comp_size - 12.
|
|
586
599
|
let end = (comp_size.saturating_sub(12)).min(input.len());
|
|
587
600
|
|
|
588
|
-
let mut output = Vec::with_capacity(raw_size as usize);
|
|
601
|
+
let mut output = Vec::with_capacity((raw_size as usize).min(MAX_RTF_DECOMPRESSED_CAPACITY));
|
|
589
602
|
let mut pos = 0usize;
|
|
590
603
|
|
|
591
604
|
while pos < end {
|
|
@@ -2105,6 +2118,44 @@ mod tests {
|
|
|
2105
2118
|
assert_eq!(headers.get("user_agent").unwrap(), "MyAgent/1.0");
|
|
2106
2119
|
}
|
|
2107
2120
|
|
|
2121
|
+
#[test]
|
|
2122
|
+
fn test_maybe_transcode_utf16_short_binary_does_not_trigger_heuristic() {
|
|
2123
|
+
assert!(maybe_transcode_utf16(&[b'M', 0, b'I', 0]).is_none());
|
|
2124
|
+
}
|
|
2125
|
+
|
|
2126
|
+
#[test]
|
|
2127
|
+
fn test_decompress_rtf_compressed_crafted_raw_size_does_not_over_allocate() {
|
|
2128
|
+
let mut data = Vec::with_capacity(20);
|
|
2129
|
+
data.extend_from_slice(&16u32.to_le_bytes());
|
|
2130
|
+
data.extend_from_slice(&0xFFFF_FFFFu32.to_le_bytes());
|
|
2131
|
+
data.extend_from_slice(&0x75465a4cu32.to_le_bytes());
|
|
2132
|
+
data.extend_from_slice(&0u32.to_le_bytes());
|
|
2133
|
+
data.extend_from_slice(&[0x00, b'A', b'B', b'C']);
|
|
2134
|
+
|
|
2135
|
+
let out = decompress_rtf_compressed(&data).expect("crafted size should not force OOM");
|
|
2136
|
+
assert!(out.len() < 16, "output should stay tiny");
|
|
2137
|
+
}
|
|
2138
|
+
|
|
2139
|
+
#[test]
|
|
2140
|
+
fn test_decompress_rtf_compressed_cap_is_hint_only() {
|
|
2141
|
+
let payload: &[u8] = &[
|
|
2142
|
+
0x00, b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', 0x00, b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P',
|
|
2143
|
+
0x00, b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X',
|
|
2144
|
+
];
|
|
2145
|
+
let comp_size = (12 + payload.len()) as u32;
|
|
2146
|
+
let raw_size = 1u32;
|
|
2147
|
+
let mut data = Vec::new();
|
|
2148
|
+
data.extend_from_slice(&comp_size.to_le_bytes());
|
|
2149
|
+
data.extend_from_slice(&raw_size.to_le_bytes());
|
|
2150
|
+
data.extend_from_slice(&0x75465a4cu32.to_le_bytes());
|
|
2151
|
+
data.extend_from_slice(&0u32.to_le_bytes());
|
|
2152
|
+
data.extend_from_slice(payload);
|
|
2153
|
+
|
|
2154
|
+
let out = decompress_rtf_compressed(&data).expect("should decompress");
|
|
2155
|
+
assert_eq!(out.len(), 24);
|
|
2156
|
+
assert_eq!(&out[..8], b"ABCDEFGH");
|
|
2157
|
+
}
|
|
2158
|
+
|
|
2108
2159
|
#[test]
|
|
2109
2160
|
fn test_decompress_rtf_compressed_too_short() {
|
|
2110
2161
|
assert!(decompress_rtf_compressed(&[0u8; 10]).is_none());
|