kreuzberg 4.9.1 → 4.9.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +15 -15
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +7 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +4 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/chunking/semantic/mod.rs +132 -19
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +53 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
- data/vendor/kreuzberg/src/core/config/processing.rs +7 -5
- data/vendor/kreuzberg/src/core/extractor/batch.rs +14 -2
- data/vendor/kreuzberg/src/extraction/docx/mod.rs +102 -413
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +91 -4
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
- data/vendor/kreuzberg/src/extraction/pst.rs +111 -4
- data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +21 -26
- data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +32 -1
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +26 -5
- data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
- data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
- data/vendor/kreuzberg/src/llm/client.rs +26 -6
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +49 -3
- data/vendor/kreuzberg/src/pdf/structure/adapters.rs +40 -1
- data/vendor/kreuzberg/src/pdf/structure/assembly.rs +32 -0
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +21 -0
- data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +31 -6
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +735 -114
- data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
- data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +114 -12
- data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +7 -5
- data/vendor/kreuzberg/tests/llm_integration.rs +3 -3
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +5 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
- data/vendor/kreuzberg-ffi/src/config/loader.rs +5 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +1 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +8 -4
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +2 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +3 -0
- data/vendor/kreuzberg-ffi/src/error.rs +9 -8
- data/vendor/kreuzberg-ffi/src/lib.rs +5 -1
- data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3d8a203168595f6b316a165f500818abed75d89c7a82c46b5b20df996a4bb841
|
|
4
|
+
data.tar.gz: 28fd19fecd9b18597f17a783923ec3ec08cfa7b99612fec1ca8790aa5cdddbdc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: af522bff519c1082396d9a6a9480a088693791a4f50818fd1d233726082675559a3144f7758b0ee217b18cdbb1cd08236ecbb332f68c4186a26aa69b83454392
|
|
7
|
+
data.tar.gz: 4109e6dbc32c5fed518ba84940a7bc732553a178d3b67b69dad6eff5b998aad12b0a53cfe6c6d0784848cbb484e7b79a7abe9154f23d2100786f38414d8286c0
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -2127,9 +2127,9 @@ dependencies = [
|
|
|
2127
2127
|
|
|
2128
2128
|
[[package]]
|
|
2129
2129
|
name = "html-to-markdown-rs"
|
|
2130
|
-
version = "3.2.
|
|
2130
|
+
version = "3.2.6"
|
|
2131
2131
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2132
|
-
checksum = "
|
|
2132
|
+
checksum = "bc4b9f5076d013aac34a0369c73035cf68f3d9e0771ce96a99e5a02e7e3bf9d4"
|
|
2133
2133
|
dependencies = [
|
|
2134
2134
|
"ahash",
|
|
2135
2135
|
"astral-tl",
|
|
@@ -2916,7 +2916,7 @@ dependencies = [
|
|
|
2916
2916
|
|
|
2917
2917
|
[[package]]
|
|
2918
2918
|
name = "kreuzberg-rb"
|
|
2919
|
-
version = "4.9.
|
|
2919
|
+
version = "4.9.3"
|
|
2920
2920
|
dependencies = [
|
|
2921
2921
|
"async-trait",
|
|
2922
2922
|
"html-to-markdown-rs",
|
|
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
|
|
3040
3040
|
|
|
3041
3041
|
[[package]]
|
|
3042
3042
|
name = "liter-llm"
|
|
3043
|
-
version = "1.2.
|
|
3043
|
+
version = "1.2.2"
|
|
3044
3044
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3045
|
-
checksum = "
|
|
3045
|
+
checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
|
|
3046
3046
|
dependencies = [
|
|
3047
3047
|
"base64 0.22.1",
|
|
3048
3048
|
"bytes",
|
|
@@ -3634,9 +3634,9 @@ dependencies = [
|
|
|
3634
3634
|
|
|
3635
3635
|
[[package]]
|
|
3636
3636
|
name = "openssl"
|
|
3637
|
-
version = "0.10.
|
|
3637
|
+
version = "0.10.78"
|
|
3638
3638
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3639
|
-
checksum = "
|
|
3639
|
+
checksum = "f38c4372413cdaaf3cc79dd92d29d7d9f5ab09b51b10dded508fb90bb70b9222"
|
|
3640
3640
|
dependencies = [
|
|
3641
3641
|
"bitflags",
|
|
3642
3642
|
"cfg-if",
|
|
@@ -3666,9 +3666,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
|
|
|
3666
3666
|
|
|
3667
3667
|
[[package]]
|
|
3668
3668
|
name = "openssl-sys"
|
|
3669
|
-
version = "0.9.
|
|
3669
|
+
version = "0.9.114"
|
|
3670
3670
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3671
|
-
checksum = "
|
|
3671
|
+
checksum = "13ce1245cd07fcc4cfdb438f7507b0c7e4f3849a69fd84d52374c66d83741bb6"
|
|
3672
3672
|
dependencies = [
|
|
3673
3673
|
"cc",
|
|
3674
3674
|
"libc",
|
|
@@ -4619,9 +4619,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
|
|
|
4619
4619
|
|
|
4620
4620
|
[[package]]
|
|
4621
4621
|
name = "rustls-webpki"
|
|
4622
|
-
version = "0.103.
|
|
4622
|
+
version = "0.103.13"
|
|
4623
4623
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4624
|
-
checksum = "
|
|
4624
|
+
checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
|
|
4625
4625
|
dependencies = [
|
|
4626
4626
|
"aws-lc-rs",
|
|
4627
4627
|
"ring",
|
|
@@ -5506,7 +5506,7 @@ dependencies = [
|
|
|
5506
5506
|
"toml_datetime 1.1.1+spec-1.1.0",
|
|
5507
5507
|
"toml_parser",
|
|
5508
5508
|
"toml_writer",
|
|
5509
|
-
"winnow 1.0.
|
|
5509
|
+
"winnow 1.0.2",
|
|
5510
5510
|
]
|
|
5511
5511
|
|
|
5512
5512
|
[[package]]
|
|
@@ -5533,7 +5533,7 @@ version = "1.1.2+spec-1.1.0"
|
|
|
5533
5533
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5534
5534
|
checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
|
|
5535
5535
|
dependencies = [
|
|
5536
|
-
"winnow 1.0.
|
|
5536
|
+
"winnow 1.0.2",
|
|
5537
5537
|
]
|
|
5538
5538
|
|
|
5539
5539
|
[[package]]
|
|
@@ -6577,9 +6577,9 @@ checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
|
|
|
6577
6577
|
|
|
6578
6578
|
[[package]]
|
|
6579
6579
|
name = "winnow"
|
|
6580
|
-
version = "1.0.
|
|
6580
|
+
version = "1.0.2"
|
|
6581
6581
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6582
|
-
checksum = "
|
|
6582
|
+
checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0"
|
|
6583
6583
|
|
|
6584
6584
|
[[package]]
|
|
6585
6585
|
name = "wit-bindgen"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.1"
|
|
3
|
+
version = "4.9.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.2.
|
|
68
|
+
html-to-markdown-rs = { version = "3.2.6", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
|
@@ -404,6 +404,12 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
|
|
|
404
404
|
true
|
|
405
405
|
};
|
|
406
406
|
|
|
407
|
+
let max_images_per_page = if let Some(val) = get_kw(ruby, hash, "max_images_per_page") {
|
|
408
|
+
Some(u32::try_convert(val)?)
|
|
409
|
+
} else {
|
|
410
|
+
None
|
|
411
|
+
};
|
|
412
|
+
|
|
407
413
|
let config = ImageExtractionConfig {
|
|
408
414
|
extract_images,
|
|
409
415
|
target_dpi,
|
|
@@ -412,6 +418,7 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
|
|
|
412
418
|
auto_adjust_dpi,
|
|
413
419
|
min_dpi,
|
|
414
420
|
max_dpi,
|
|
421
|
+
max_images_per_page,
|
|
415
422
|
};
|
|
416
423
|
|
|
417
424
|
Ok(config)
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.1"
|
|
5
|
+
version = "4.9.4"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -28,12 +28,12 @@ dbase = "0.7"
|
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.2.
|
|
31
|
+
html-to-markdown-rs = { version = "3.2.6", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.37", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.12.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.1"
|
|
3
|
+
version = "4.9.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
271
|
"simd",
|
|
272
272
|
], optional = true }
|
|
273
273
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.2.
|
|
274
|
+
html-to-markdown-rs = { version = "3.2.6", default-features = false, features = [
|
|
275
275
|
"inline-images",
|
|
276
276
|
"metadata",
|
|
277
277
|
], optional = true }
|
|
@@ -287,7 +287,7 @@ image = { version = "0.25.10", default-features = false, features = [
|
|
|
287
287
|
], optional = true }
|
|
288
288
|
indexmap = "2.14.0"
|
|
289
289
|
infer = "0.19.0"
|
|
290
|
-
jotdown = "0.
|
|
290
|
+
jotdown = "0.10"
|
|
291
291
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
292
292
|
|
|
293
293
|
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.
|
|
21
|
+
> **🚀 Version 4.9.4 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -25,10 +25,6 @@ const SEGMENT_SIZE: usize = 200;
|
|
|
25
25
|
#[cfg(feature = "embeddings")]
|
|
26
26
|
const DEFAULT_TOPIC_THRESHOLD: f32 = 0.75;
|
|
27
27
|
|
|
28
|
-
/// Safety ceiling for auto-budget when no embedding model is configured.
|
|
29
|
-
/// Prevents unbounded chunks in header-less documents.
|
|
30
|
-
const AUTO_BUDGET_CEILING: usize = 4000;
|
|
31
|
-
|
|
32
28
|
/// Split text into semantically coherent chunks.
|
|
33
29
|
///
|
|
34
30
|
/// Splits text into fine-grained segments, detects structural (and optionally
|
|
@@ -46,6 +42,8 @@ pub fn chunk_semantic(
|
|
|
46
42
|
});
|
|
47
43
|
}
|
|
48
44
|
|
|
45
|
+
warn_if_fallback_path(config);
|
|
46
|
+
|
|
49
47
|
let seg_size = SEGMENT_SIZE;
|
|
50
48
|
let has_markdown_headers = text.lines().any(crate::utils::markdown_utils::is_markdown_header);
|
|
51
49
|
let splitter_segments: Vec<&str> = if has_markdown_headers {
|
|
@@ -165,11 +163,33 @@ fn compute_boundaries(_segments: &[Segment<'_>], forced: &[bool], _config: &Chun
|
|
|
165
163
|
Ok(forced.to_vec())
|
|
166
164
|
}
|
|
167
165
|
|
|
168
|
-
///
|
|
166
|
+
/// Warn when the semantic chunker is invoked without an embedding model.
|
|
167
|
+
///
|
|
168
|
+
/// Without an embedding, `chunk_semantic` falls back to a structural-boundary
|
|
169
|
+
/// heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs).
|
|
170
|
+
/// Topic-similarity chunking requires an embedding model. This warning makes
|
|
171
|
+
/// the fallback mode discoverable to callers who think they're getting
|
|
172
|
+
/// embedding-driven topic detection.
|
|
173
|
+
#[cfg(feature = "embeddings")]
|
|
174
|
+
fn warn_if_fallback_path(config: &ChunkingConfig) {
|
|
175
|
+
if config.embedding.is_none() {
|
|
176
|
+
tracing::warn!(
|
|
177
|
+
"chunker_type='semantic' without an EmbeddingConfig falls back to a \
|
|
178
|
+
structural-boundary heuristic; topic-similarity chunking requires an \
|
|
179
|
+
embedding model. Either configure `embedding` or switch to \
|
|
180
|
+
chunker_type='text'/'markdown' to silence this warning."
|
|
181
|
+
);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
#[cfg(not(feature = "embeddings"))]
|
|
186
|
+
fn warn_if_fallback_path(_config: &ChunkingConfig) {}
|
|
187
|
+
|
|
188
|
+
/// Resolve the size ceiling for merged chunks.
|
|
169
189
|
///
|
|
170
|
-
/// When an embedding preset is configured, use its chunk_size
|
|
171
|
-
///
|
|
172
|
-
///
|
|
190
|
+
/// When an embedding preset is configured, use its `chunk_size` so chunks fit
|
|
191
|
+
/// in the model's context window. Otherwise honor the caller's configured
|
|
192
|
+
/// `max_characters`.
|
|
173
193
|
fn resolve_ceiling(config: &ChunkingConfig) -> usize {
|
|
174
194
|
#[cfg(feature = "embeddings")]
|
|
175
195
|
if let Some(ref emb) = config.embedding
|
|
@@ -178,8 +198,7 @@ fn resolve_ceiling(config: &ChunkingConfig) -> usize {
|
|
|
178
198
|
{
|
|
179
199
|
return size;
|
|
180
200
|
}
|
|
181
|
-
|
|
182
|
-
AUTO_BUDGET_CEILING
|
|
201
|
+
config.max_characters
|
|
183
202
|
}
|
|
184
203
|
|
|
185
204
|
#[cfg(test)]
|
|
@@ -306,30 +325,124 @@ mod tests {
|
|
|
306
325
|
}
|
|
307
326
|
|
|
308
327
|
#[test]
|
|
309
|
-
fn
|
|
310
|
-
// A large block of text with no headers
|
|
311
|
-
//
|
|
312
|
-
let text = "word ".repeat(1500); // ~7500 chars
|
|
328
|
+
fn max_characters_caps_oversized_headerless_text() {
|
|
329
|
+
// A large block of text with no headers must be split so every chunk
|
|
330
|
+
// respects the caller's configured max_characters.
|
|
331
|
+
let text = "word ".repeat(1500); // ~7500 chars
|
|
332
|
+
let max = 1000;
|
|
313
333
|
let config = ChunkingConfig {
|
|
314
|
-
max_characters:
|
|
334
|
+
max_characters: max,
|
|
315
335
|
overlap: 0,
|
|
316
336
|
trim: true,
|
|
317
337
|
chunker_type: ChunkerType::Semantic,
|
|
318
338
|
..Default::default()
|
|
319
339
|
};
|
|
320
340
|
let result = chunk_semantic(&text, &config, None).unwrap();
|
|
321
|
-
assert!(result.chunks.len() >= 2, "should split at
|
|
341
|
+
assert!(result.chunks.len() >= 2, "should split at max_characters, got 1 chunk");
|
|
322
342
|
for (i, chunk) in result.chunks.iter().enumerate() {
|
|
323
343
|
assert!(
|
|
324
|
-
chunk.content.chars().count() <=
|
|
325
|
-
"chunk {} exceeds
|
|
344
|
+
chunk.content.chars().count() <= max,
|
|
345
|
+
"chunk {} exceeds max_characters: {} > {}",
|
|
326
346
|
i,
|
|
327
347
|
chunk.content.chars().count(),
|
|
328
|
-
|
|
348
|
+
max
|
|
329
349
|
);
|
|
330
350
|
}
|
|
331
351
|
}
|
|
332
352
|
|
|
353
|
+
#[test]
|
|
354
|
+
fn max_characters_controls_fallback_chunk_size() {
|
|
355
|
+
// bb-yq35 repro: with no embedding configured, different max_characters
|
|
356
|
+
// values must produce different chunking output.
|
|
357
|
+
let sample = format!(
|
|
358
|
+
"{}{}{}",
|
|
359
|
+
"Solar panel efficiency improves. ".repeat(200),
|
|
360
|
+
"\n\nFDA clinical trials require double-blind. ".repeat(200),
|
|
361
|
+
"\n\nQuantum entanglement needs cooling. ".repeat(200),
|
|
362
|
+
);
|
|
363
|
+
|
|
364
|
+
let run = |max: usize| {
|
|
365
|
+
let config = ChunkingConfig {
|
|
366
|
+
max_characters: max,
|
|
367
|
+
overlap: 0,
|
|
368
|
+
trim: true,
|
|
369
|
+
chunker_type: ChunkerType::Semantic,
|
|
370
|
+
..Default::default()
|
|
371
|
+
};
|
|
372
|
+
chunk_semantic(&sample, &config, None).unwrap()
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
let small = run(500);
|
|
376
|
+
let large = run(1500);
|
|
377
|
+
|
|
378
|
+
assert!(
|
|
379
|
+
small.chunks.len() > large.chunks.len(),
|
|
380
|
+
"smaller max_characters must yield more chunks: small={}, large={}",
|
|
381
|
+
small.chunks.len(),
|
|
382
|
+
large.chunks.len()
|
|
383
|
+
);
|
|
384
|
+
for chunk in &small.chunks {
|
|
385
|
+
assert!(
|
|
386
|
+
chunk.content.chars().count() <= 500,
|
|
387
|
+
"small chunk exceeds cap: {}",
|
|
388
|
+
chunk.content.chars().count()
|
|
389
|
+
);
|
|
390
|
+
}
|
|
391
|
+
for chunk in &large.chunks {
|
|
392
|
+
assert!(
|
|
393
|
+
chunk.content.chars().count() <= 1500,
|
|
394
|
+
"large chunk exceeds cap: {}",
|
|
395
|
+
chunk.content.chars().count()
|
|
396
|
+
);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
#[cfg(feature = "embeddings")]
|
|
401
|
+
#[test]
|
|
402
|
+
fn semantic_without_embedding_warns() {
|
|
403
|
+
use std::io::Write;
|
|
404
|
+
use std::sync::{Arc, Mutex};
|
|
405
|
+
|
|
406
|
+
#[derive(Clone, Default)]
|
|
407
|
+
struct Buf(Arc<Mutex<Vec<u8>>>);
|
|
408
|
+
impl Write for Buf {
|
|
409
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
|
410
|
+
self.0.lock().unwrap().extend_from_slice(buf);
|
|
411
|
+
Ok(buf.len())
|
|
412
|
+
}
|
|
413
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
|
414
|
+
Ok(())
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for Buf {
|
|
418
|
+
type Writer = Buf;
|
|
419
|
+
fn make_writer(&'a self) -> Self::Writer {
|
|
420
|
+
self.clone()
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
let buffer = Buf::default();
|
|
425
|
+
let subscriber = tracing_subscriber::fmt()
|
|
426
|
+
.with_writer(buffer.clone())
|
|
427
|
+
.with_max_level(tracing::Level::WARN)
|
|
428
|
+
.with_ansi(false)
|
|
429
|
+
.finish();
|
|
430
|
+
|
|
431
|
+
tracing::subscriber::with_default(subscriber, || {
|
|
432
|
+
let config = ChunkingConfig {
|
|
433
|
+
chunker_type: ChunkerType::Semantic,
|
|
434
|
+
..Default::default()
|
|
435
|
+
};
|
|
436
|
+
let _ = chunk_semantic("hello world", &config, None).unwrap();
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
let captured = String::from_utf8(buffer.0.lock().unwrap().clone()).unwrap();
|
|
440
|
+
assert!(
|
|
441
|
+
captured.contains("without an EmbeddingConfig"),
|
|
442
|
+
"expected fallback warning in captured logs, got: {captured:?}"
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
|
|
333
446
|
#[test]
|
|
334
447
|
fn sections_with_headers_produce_separate_chunks() {
|
|
335
448
|
// Each section has enough content that the segments span multiple paragraphs.
|
|
@@ -40,6 +40,18 @@ pub struct ImageExtractionConfig {
|
|
|
40
40
|
/// Maximum DPI threshold
|
|
41
41
|
#[serde(default = "default_max_dpi")]
|
|
42
42
|
pub max_dpi: i32,
|
|
43
|
+
|
|
44
|
+
/// Maximum number of image objects to extract per PDF page.
|
|
45
|
+
///
|
|
46
|
+
/// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
|
|
47
|
+
/// can trigger extremely long or indefinite extraction times when every image
|
|
48
|
+
/// object on a dense page is decoded individually via pdfium FFI. Setting this
|
|
49
|
+
/// limit causes kreuzberg to stop collecting individual images once the count
|
|
50
|
+
/// per page reaches the cap and emit a warning instead.
|
|
51
|
+
///
|
|
52
|
+
/// `None` (default) means no limit — all images are extracted.
|
|
53
|
+
#[serde(default)]
|
|
54
|
+
pub max_images_per_page: Option<u32>,
|
|
43
55
|
}
|
|
44
56
|
|
|
45
57
|
/// Token reduction configuration.
|
|
@@ -98,3 +110,44 @@ fn default_reduction_mode() -> String {
|
|
|
98
110
|
fn default_confidence() -> f64 {
|
|
99
111
|
0.8
|
|
100
112
|
}
|
|
113
|
+
|
|
114
|
+
#[cfg(test)]
|
|
115
|
+
mod tests {
|
|
116
|
+
use super::*;
|
|
117
|
+
|
|
118
|
+
#[test]
|
|
119
|
+
fn test_max_images_per_page_defaults_none() {
|
|
120
|
+
let config = ImageExtractionConfig::default();
|
|
121
|
+
assert_eq!(config.max_images_per_page, None);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
#[test]
|
|
125
|
+
fn test_max_images_per_page_serializes_as_null_when_none() {
|
|
126
|
+
let config = ImageExtractionConfig::default();
|
|
127
|
+
let json = serde_json::to_string(&config).unwrap();
|
|
128
|
+
assert!(json.contains("\"max_images_per_page\":null"));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
#[test]
|
|
132
|
+
fn test_max_images_per_page_roundtrips_via_json() {
|
|
133
|
+
let config = ImageExtractionConfig {
|
|
134
|
+
max_images_per_page: Some(50),
|
|
135
|
+
..Default::default()
|
|
136
|
+
};
|
|
137
|
+
let json = serde_json::to_string(&config).unwrap();
|
|
138
|
+
let back: ImageExtractionConfig = serde_json::from_str(&json).unwrap();
|
|
139
|
+
assert_eq!(back.max_images_per_page, Some(50));
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Regression test for issue #766: missing field in JSON must not break
|
|
143
|
+
/// deserialization (backwards-compat — existing configs without this key
|
|
144
|
+
/// must still deserialize cleanly).
|
|
145
|
+
#[test]
|
|
146
|
+
fn test_max_images_per_page_absent_in_json_deserializes_as_none() {
|
|
147
|
+
let json = r#"{"extract_images":true,"target_dpi":300,"max_image_dimension":4096,
|
|
148
|
+
"inject_placeholders":true,"auto_adjust_dpi":true,
|
|
149
|
+
"min_dpi":72,"max_dpi":600}"#;
|
|
150
|
+
let config: ImageExtractionConfig = serde_json::from_str(json).unwrap();
|
|
151
|
+
assert_eq!(config.max_images_per_page, None);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
@@ -323,9 +323,12 @@ impl OcrConfig {
|
|
|
323
323
|
/// Returns the effective pipeline config.
|
|
324
324
|
///
|
|
325
325
|
/// - If `pipeline` is explicitly set, returns it.
|
|
326
|
-
/// - If `paddle-ocr`
|
|
327
|
-
/// auto-constructs
|
|
328
|
-
/// - Otherwise returns `None` (single-backend mode
|
|
326
|
+
/// - If `paddle-ocr` is compiled in and the backend is the default
|
|
327
|
+
/// (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
|
|
328
|
+
/// - Otherwise returns `None` (single-backend mode).
|
|
329
|
+
///
|
|
330
|
+
/// Explicit non-default backend selections are honored as-is — a silent
|
|
331
|
+
/// paddleocr fallback would mask errors from the chosen backend.
|
|
329
332
|
pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
|
|
330
333
|
if self.pipeline.is_some() {
|
|
331
334
|
return self.pipeline.clone();
|
|
@@ -333,25 +336,28 @@ impl OcrConfig {
|
|
|
333
336
|
|
|
334
337
|
#[cfg(feature = "paddle-ocr")]
|
|
335
338
|
{
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
339
|
+
if self.backend != default_tesseract_backend() {
|
|
340
|
+
return None;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
let stages = vec![
|
|
344
|
+
OcrPipelineStage {
|
|
345
|
+
backend: self.backend.clone(),
|
|
346
|
+
priority: 100,
|
|
347
|
+
language: None,
|
|
348
|
+
tesseract_config: self.tesseract_config.clone(),
|
|
349
|
+
paddle_ocr_config: None,
|
|
350
|
+
vlm_config: self.vlm_config.clone(),
|
|
351
|
+
},
|
|
352
|
+
OcrPipelineStage {
|
|
347
353
|
backend: "paddleocr".to_string(),
|
|
348
354
|
priority: 50,
|
|
349
355
|
language: None,
|
|
350
356
|
tesseract_config: None,
|
|
351
357
|
paddle_ocr_config: self.paddle_ocr_config.clone(),
|
|
352
358
|
vlm_config: None,
|
|
353
|
-
}
|
|
354
|
-
|
|
359
|
+
},
|
|
360
|
+
];
|
|
355
361
|
Some(OcrPipelineConfig {
|
|
356
362
|
stages,
|
|
357
363
|
quality_thresholds: self.effective_thresholds(),
|
|
@@ -485,29 +491,21 @@ mod tests {
|
|
|
485
491
|
}
|
|
486
492
|
|
|
487
493
|
#[test]
|
|
488
|
-
fn
|
|
489
|
-
// When primary backend is "paddleocr", effective_pipeline should NOT add
|
|
490
|
-
// a second paddleocr stage (issue #6 fix).
|
|
494
|
+
fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
|
|
491
495
|
let config = OcrConfig {
|
|
492
496
|
backend: "paddleocr".to_string(),
|
|
493
497
|
..Default::default()
|
|
494
498
|
};
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
);
|
|
506
|
-
}
|
|
507
|
-
#[cfg(not(feature = "paddle-ocr"))]
|
|
508
|
-
{
|
|
509
|
-
assert!(result.is_none());
|
|
510
|
-
}
|
|
499
|
+
assert!(config.effective_pipeline().is_none());
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
#[test]
|
|
503
|
+
fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
|
|
504
|
+
let config = OcrConfig {
|
|
505
|
+
backend: "easyocr".to_string(),
|
|
506
|
+
..Default::default()
|
|
507
|
+
};
|
|
508
|
+
assert!(config.effective_pipeline().is_none());
|
|
511
509
|
}
|
|
512
510
|
|
|
513
511
|
#[test]
|
|
@@ -14,11 +14,13 @@ use std::path::PathBuf;
|
|
|
14
14
|
/// * `Text` - Generic text splitter, splits on whitespace and punctuation
|
|
15
15
|
/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
|
16
16
|
/// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
|
|
17
|
-
/// * `Semantic` - Topic-aware chunker
|
|
18
|
-
///
|
|
19
|
-
///
|
|
20
|
-
///
|
|
21
|
-
///
|
|
17
|
+
/// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
|
|
18
|
+
/// embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
|
|
19
|
+
/// lower = more splits). Without an embedding, falls back to a
|
|
20
|
+
/// structural-boundary heuristic (ALL-CAPS headers, numbered sections,
|
|
21
|
+
/// blank-line paragraphs) and merges groups into chunks capped at
|
|
22
|
+
/// `max_characters` (default 1000). `topic_threshold` has no effect in the
|
|
23
|
+
/// fallback path. For best results, pair with an embedding model.
|
|
22
24
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
|
23
25
|
#[serde(rename_all = "lowercase")]
|
|
24
26
|
pub enum ChunkerType {
|