kreuzberg 4.9.2 → 4.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +8 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +6 -6
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/chunking/semantic/mod.rs +132 -19
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +53 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +8 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +79 -19
- data/vendor/kreuzberg/src/core/extractor/batch.rs +14 -2
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +27 -3
- data/vendor/kreuzberg/src/core/extractor/file.rs +27 -3
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -20
- data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx/mod.rs +102 -413
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +91 -4
- data/vendor/kreuzberg/src/extraction/email.rs +72 -10
- data/vendor/kreuzberg/src/extraction/image.rs +2 -2
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +6 -1
- data/vendor/kreuzberg/src/extraction/pst.rs +111 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +249 -4
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -5
- data/vendor/kreuzberg/src/extractors/docx.rs +21 -26
- data/vendor/kreuzberg/src/extractors/email.rs +12 -11
- data/vendor/kreuzberg/src/extractors/hwp.rs +18 -5
- data/vendor/kreuzberg/src/extractors/image.rs +11 -6
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +28 -1
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +51 -19
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +26 -8
- data/vendor/kreuzberg/src/llm/client.rs +26 -6
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +49 -3
- data/vendor/kreuzberg/src/mcp/params.rs +17 -1
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +1 -0
- data/vendor/kreuzberg/src/ocr/types.rs +11 -1
- data/vendor/kreuzberg/src/ort_discovery.rs +74 -22
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +108 -10
- data/vendor/kreuzberg/src/pdf/images.rs +134 -8
- data/vendor/kreuzberg/src/pdf/structure/adapters.rs +40 -1
- data/vendor/kreuzberg/src/pdf/structure/assembly.rs +32 -0
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +21 -0
- data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +31 -6
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +69 -11
- data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +123 -12
- data/vendor/kreuzberg/src/rendering/djot.rs +8 -0
- data/vendor/kreuzberg/src/rendering/markdown.rs +7 -0
- data/vendor/kreuzberg/src/rendering/plain.rs +16 -7
- data/vendor/kreuzberg/src/types/formats.rs +6 -2
- data/vendor/kreuzberg/src/utils/image_decode.rs +99 -0
- data/vendor/kreuzberg/src/utils/mod.rs +8 -0
- data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +7 -5
- data/vendor/kreuzberg/tests/docx_ocr_integration_test.rs +84 -0
- data/vendor/kreuzberg/tests/email_integration.rs +18 -7
- data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +92 -0
- data/vendor/kreuzberg/tests/gpu_acceleration.rs +419 -0
- data/vendor/kreuzberg/tests/issue_797_preset_embedding_regression.rs +75 -0
- data/vendor/kreuzberg/tests/llm_integration.rs +3 -3
- data/vendor/kreuzberg/tests/markdown_lint_quality.rs +18 -6
- data/vendor/kreuzberg/tests/mcp_integration.rs +13 -5
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +16 -20
- data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +129 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
- data/vendor/kreuzberg/tests/test_batch_extract_schema.rs +56 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +5 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/config/loader.rs +5 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +1 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +8 -4
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +2 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +3 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +5 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +5 -0
- metadata +8 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6921
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9f3132b44aad1652c76e8b1445b775eb3586e48661908eda794c95339f06387d
|
|
4
|
+
data.tar.gz: 2f957af07040ec2f3bcd79c299dd429a752423d714eea73bfb608a28718a6c11
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 878748ecb791e049c2de05cdc4ec7b9f6749bb265981c98ea49126108ca7c2782b92a6b5ed31d1fbfbeee83e3c45c80aaf74aacecd20f9bc428d796709afa0aa
|
|
7
|
+
data.tar.gz: ff137eb78f8fcfcc2ac357b0d9adf6d3d6fee11a448679a976678e0745905a0abcd8abfeb331028d780810d96dc47a04ec01dda94c900ec63ad4b35c124c187f
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.6" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.
|
|
3
|
+
version = "4.9.6"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.
|
|
68
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
|
@@ -54,6 +54,7 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
|
|
|
54
54
|
quality_thresholds: None,
|
|
55
55
|
vlm_config: None,
|
|
56
56
|
vlm_prompt: None,
|
|
57
|
+
acceleration: None,
|
|
57
58
|
};
|
|
58
59
|
|
|
59
60
|
if let Some(val) = get_kw(ruby, hash, "tesseract_config")
|
|
@@ -404,6 +405,12 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
|
|
|
404
405
|
true
|
|
405
406
|
};
|
|
406
407
|
|
|
408
|
+
let max_images_per_page = if let Some(val) = get_kw(ruby, hash, "max_images_per_page") {
|
|
409
|
+
Some(u32::try_convert(val)?)
|
|
410
|
+
} else {
|
|
411
|
+
None
|
|
412
|
+
};
|
|
413
|
+
|
|
407
414
|
let config = ImageExtractionConfig {
|
|
408
415
|
extract_images,
|
|
409
416
|
target_dpi,
|
|
@@ -412,6 +419,7 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
|
|
|
412
419
|
auto_adjust_dpi,
|
|
413
420
|
min_dpi,
|
|
414
421
|
max_dpi,
|
|
422
|
+
max_images_per_page,
|
|
415
423
|
};
|
|
416
424
|
|
|
417
425
|
Ok(config)
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.
|
|
5
|
+
version = "4.9.6"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -28,12 +28,12 @@ dbase = "0.7"
|
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.
|
|
31
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.6", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.6" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.37", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.12.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
|
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
|
|
|
57
57
|
tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
58
|
toml = "1.1.2"
|
|
59
59
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.
|
|
60
|
+
tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
|
|
61
61
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
62
|
wasm-bindgen-futures = "0.4"
|
|
63
63
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.
|
|
3
|
+
version = "4.9.6"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
271
|
"simd",
|
|
272
272
|
], optional = true }
|
|
273
273
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.
|
|
274
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
|
|
275
275
|
"inline-images",
|
|
276
276
|
"metadata",
|
|
277
277
|
], optional = true }
|
|
@@ -287,7 +287,7 @@ image = { version = "0.25.10", default-features = false, features = [
|
|
|
287
287
|
], optional = true }
|
|
288
288
|
indexmap = "2.14.0"
|
|
289
289
|
infer = "0.19.0"
|
|
290
|
-
jotdown = "0.
|
|
290
|
+
jotdown = "0.10"
|
|
291
291
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
292
292
|
|
|
293
293
|
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
@@ -392,7 +392,7 @@ optional = true
|
|
|
392
392
|
# Override getrandom to enable js feature for WASM targets
|
|
393
393
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
394
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.
|
|
395
|
+
tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
|
|
396
396
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
397
|
|
|
398
398
|
[build-dependencies]
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.
|
|
21
|
+
> **🚀 Version 4.9.6 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -25,10 +25,6 @@ const SEGMENT_SIZE: usize = 200;
|
|
|
25
25
|
#[cfg(feature = "embeddings")]
|
|
26
26
|
const DEFAULT_TOPIC_THRESHOLD: f32 = 0.75;
|
|
27
27
|
|
|
28
|
-
/// Safety ceiling for auto-budget when no embedding model is configured.
|
|
29
|
-
/// Prevents unbounded chunks in header-less documents.
|
|
30
|
-
const AUTO_BUDGET_CEILING: usize = 4000;
|
|
31
|
-
|
|
32
28
|
/// Split text into semantically coherent chunks.
|
|
33
29
|
///
|
|
34
30
|
/// Splits text into fine-grained segments, detects structural (and optionally
|
|
@@ -46,6 +42,8 @@ pub fn chunk_semantic(
|
|
|
46
42
|
});
|
|
47
43
|
}
|
|
48
44
|
|
|
45
|
+
warn_if_fallback_path(config);
|
|
46
|
+
|
|
49
47
|
let seg_size = SEGMENT_SIZE;
|
|
50
48
|
let has_markdown_headers = text.lines().any(crate::utils::markdown_utils::is_markdown_header);
|
|
51
49
|
let splitter_segments: Vec<&str> = if has_markdown_headers {
|
|
@@ -165,11 +163,33 @@ fn compute_boundaries(_segments: &[Segment<'_>], forced: &[bool], _config: &Chun
|
|
|
165
163
|
Ok(forced.to_vec())
|
|
166
164
|
}
|
|
167
165
|
|
|
168
|
-
///
|
|
166
|
+
/// Warn when the semantic chunker is invoked without an embedding model.
|
|
167
|
+
///
|
|
168
|
+
/// Without an embedding, `chunk_semantic` falls back to a structural-boundary
|
|
169
|
+
/// heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs).
|
|
170
|
+
/// Topic-similarity chunking requires an embedding model. This warning makes
|
|
171
|
+
/// the fallback mode discoverable to callers who think they're getting
|
|
172
|
+
/// embedding-driven topic detection.
|
|
173
|
+
#[cfg(feature = "embeddings")]
|
|
174
|
+
fn warn_if_fallback_path(config: &ChunkingConfig) {
|
|
175
|
+
if config.embedding.is_none() {
|
|
176
|
+
tracing::warn!(
|
|
177
|
+
"chunker_type='semantic' without an EmbeddingConfig falls back to a \
|
|
178
|
+
structural-boundary heuristic; topic-similarity chunking requires an \
|
|
179
|
+
embedding model. Either configure `embedding` or switch to \
|
|
180
|
+
chunker_type='text'/'markdown' to silence this warning."
|
|
181
|
+
);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
#[cfg(not(feature = "embeddings"))]
|
|
186
|
+
fn warn_if_fallback_path(_config: &ChunkingConfig) {}
|
|
187
|
+
|
|
188
|
+
/// Resolve the size ceiling for merged chunks.
|
|
169
189
|
///
|
|
170
|
-
/// When an embedding preset is configured, use its chunk_size
|
|
171
|
-
///
|
|
172
|
-
///
|
|
190
|
+
/// When an embedding preset is configured, use its `chunk_size` so chunks fit
|
|
191
|
+
/// in the model's context window. Otherwise honor the caller's configured
|
|
192
|
+
/// `max_characters`.
|
|
173
193
|
fn resolve_ceiling(config: &ChunkingConfig) -> usize {
|
|
174
194
|
#[cfg(feature = "embeddings")]
|
|
175
195
|
if let Some(ref emb) = config.embedding
|
|
@@ -178,8 +198,7 @@ fn resolve_ceiling(config: &ChunkingConfig) -> usize {
|
|
|
178
198
|
{
|
|
179
199
|
return size;
|
|
180
200
|
}
|
|
181
|
-
|
|
182
|
-
AUTO_BUDGET_CEILING
|
|
201
|
+
config.max_characters
|
|
183
202
|
}
|
|
184
203
|
|
|
185
204
|
#[cfg(test)]
|
|
@@ -306,30 +325,124 @@ mod tests {
|
|
|
306
325
|
}
|
|
307
326
|
|
|
308
327
|
#[test]
|
|
309
|
-
fn
|
|
310
|
-
// A large block of text with no headers
|
|
311
|
-
//
|
|
312
|
-
let text = "word ".repeat(1500); // ~7500 chars
|
|
328
|
+
fn max_characters_caps_oversized_headerless_text() {
|
|
329
|
+
// A large block of text with no headers must be split so every chunk
|
|
330
|
+
// respects the caller's configured max_characters.
|
|
331
|
+
let text = "word ".repeat(1500); // ~7500 chars
|
|
332
|
+
let max = 1000;
|
|
313
333
|
let config = ChunkingConfig {
|
|
314
|
-
max_characters:
|
|
334
|
+
max_characters: max,
|
|
315
335
|
overlap: 0,
|
|
316
336
|
trim: true,
|
|
317
337
|
chunker_type: ChunkerType::Semantic,
|
|
318
338
|
..Default::default()
|
|
319
339
|
};
|
|
320
340
|
let result = chunk_semantic(&text, &config, None).unwrap();
|
|
321
|
-
assert!(result.chunks.len() >= 2, "should split at
|
|
341
|
+
assert!(result.chunks.len() >= 2, "should split at max_characters, got 1 chunk");
|
|
322
342
|
for (i, chunk) in result.chunks.iter().enumerate() {
|
|
323
343
|
assert!(
|
|
324
|
-
chunk.content.chars().count() <=
|
|
325
|
-
"chunk {} exceeds
|
|
344
|
+
chunk.content.chars().count() <= max,
|
|
345
|
+
"chunk {} exceeds max_characters: {} > {}",
|
|
326
346
|
i,
|
|
327
347
|
chunk.content.chars().count(),
|
|
328
|
-
|
|
348
|
+
max
|
|
329
349
|
);
|
|
330
350
|
}
|
|
331
351
|
}
|
|
332
352
|
|
|
353
|
+
#[test]
|
|
354
|
+
fn max_characters_controls_fallback_chunk_size() {
|
|
355
|
+
// bb-yq35 repro: with no embedding configured, different max_characters
|
|
356
|
+
// values must produce different chunking output.
|
|
357
|
+
let sample = format!(
|
|
358
|
+
"{}{}{}",
|
|
359
|
+
"Solar panel efficiency improves. ".repeat(200),
|
|
360
|
+
"\n\nFDA clinical trials require double-blind. ".repeat(200),
|
|
361
|
+
"\n\nQuantum entanglement needs cooling. ".repeat(200),
|
|
362
|
+
);
|
|
363
|
+
|
|
364
|
+
let run = |max: usize| {
|
|
365
|
+
let config = ChunkingConfig {
|
|
366
|
+
max_characters: max,
|
|
367
|
+
overlap: 0,
|
|
368
|
+
trim: true,
|
|
369
|
+
chunker_type: ChunkerType::Semantic,
|
|
370
|
+
..Default::default()
|
|
371
|
+
};
|
|
372
|
+
chunk_semantic(&sample, &config, None).unwrap()
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
let small = run(500);
|
|
376
|
+
let large = run(1500);
|
|
377
|
+
|
|
378
|
+
assert!(
|
|
379
|
+
small.chunks.len() > large.chunks.len(),
|
|
380
|
+
"smaller max_characters must yield more chunks: small={}, large={}",
|
|
381
|
+
small.chunks.len(),
|
|
382
|
+
large.chunks.len()
|
|
383
|
+
);
|
|
384
|
+
for chunk in &small.chunks {
|
|
385
|
+
assert!(
|
|
386
|
+
chunk.content.chars().count() <= 500,
|
|
387
|
+
"small chunk exceeds cap: {}",
|
|
388
|
+
chunk.content.chars().count()
|
|
389
|
+
);
|
|
390
|
+
}
|
|
391
|
+
for chunk in &large.chunks {
|
|
392
|
+
assert!(
|
|
393
|
+
chunk.content.chars().count() <= 1500,
|
|
394
|
+
"large chunk exceeds cap: {}",
|
|
395
|
+
chunk.content.chars().count()
|
|
396
|
+
);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
#[cfg(feature = "embeddings")]
|
|
401
|
+
#[test]
|
|
402
|
+
fn semantic_without_embedding_warns() {
|
|
403
|
+
use std::io::Write;
|
|
404
|
+
use std::sync::{Arc, Mutex};
|
|
405
|
+
|
|
406
|
+
#[derive(Clone, Default)]
|
|
407
|
+
struct Buf(Arc<Mutex<Vec<u8>>>);
|
|
408
|
+
impl Write for Buf {
|
|
409
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
|
410
|
+
self.0.lock().unwrap().extend_from_slice(buf);
|
|
411
|
+
Ok(buf.len())
|
|
412
|
+
}
|
|
413
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
|
414
|
+
Ok(())
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for Buf {
|
|
418
|
+
type Writer = Buf;
|
|
419
|
+
fn make_writer(&'a self) -> Self::Writer {
|
|
420
|
+
self.clone()
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
let buffer = Buf::default();
|
|
425
|
+
let subscriber = tracing_subscriber::fmt()
|
|
426
|
+
.with_writer(buffer.clone())
|
|
427
|
+
.with_max_level(tracing::Level::WARN)
|
|
428
|
+
.with_ansi(false)
|
|
429
|
+
.finish();
|
|
430
|
+
|
|
431
|
+
tracing::subscriber::with_default(subscriber, || {
|
|
432
|
+
let config = ChunkingConfig {
|
|
433
|
+
chunker_type: ChunkerType::Semantic,
|
|
434
|
+
..Default::default()
|
|
435
|
+
};
|
|
436
|
+
let _ = chunk_semantic("hello world", &config, None).unwrap();
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
let captured = String::from_utf8(buffer.0.lock().unwrap().clone()).unwrap();
|
|
440
|
+
assert!(
|
|
441
|
+
captured.contains("without an EmbeddingConfig"),
|
|
442
|
+
"expected fallback warning in captured logs, got: {captured:?}"
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
|
|
333
446
|
#[test]
|
|
334
447
|
fn sections_with_headers_produce_separate_chunks() {
|
|
335
448
|
// Each section has enough content that the segments span multiple paragraphs.
|
|
@@ -40,6 +40,18 @@ pub struct ImageExtractionConfig {
|
|
|
40
40
|
/// Maximum DPI threshold
|
|
41
41
|
#[serde(default = "default_max_dpi")]
|
|
42
42
|
pub max_dpi: i32,
|
|
43
|
+
|
|
44
|
+
/// Maximum number of image objects to extract per PDF page.
|
|
45
|
+
///
|
|
46
|
+
/// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
|
|
47
|
+
/// can trigger extremely long or indefinite extraction times when every image
|
|
48
|
+
/// object on a dense page is decoded individually via pdfium FFI. Setting this
|
|
49
|
+
/// limit causes kreuzberg to stop collecting individual images once the count
|
|
50
|
+
/// per page reaches the cap and emit a warning instead.
|
|
51
|
+
///
|
|
52
|
+
/// `None` (default) means no limit — all images are extracted.
|
|
53
|
+
#[serde(default)]
|
|
54
|
+
pub max_images_per_page: Option<u32>,
|
|
43
55
|
}
|
|
44
56
|
|
|
45
57
|
/// Token reduction configuration.
|
|
@@ -98,3 +110,44 @@ fn default_reduction_mode() -> String {
|
|
|
98
110
|
fn default_confidence() -> f64 {
|
|
99
111
|
0.8
|
|
100
112
|
}
|
|
113
|
+
|
|
114
|
+
#[cfg(test)]
|
|
115
|
+
mod tests {
|
|
116
|
+
use super::*;
|
|
117
|
+
|
|
118
|
+
#[test]
|
|
119
|
+
fn test_max_images_per_page_defaults_none() {
|
|
120
|
+
let config = ImageExtractionConfig::default();
|
|
121
|
+
assert_eq!(config.max_images_per_page, None);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
#[test]
|
|
125
|
+
fn test_max_images_per_page_serializes_as_null_when_none() {
|
|
126
|
+
let config = ImageExtractionConfig::default();
|
|
127
|
+
let json = serde_json::to_string(&config).unwrap();
|
|
128
|
+
assert!(json.contains("\"max_images_per_page\":null"));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
#[test]
|
|
132
|
+
fn test_max_images_per_page_roundtrips_via_json() {
|
|
133
|
+
let config = ImageExtractionConfig {
|
|
134
|
+
max_images_per_page: Some(50),
|
|
135
|
+
..Default::default()
|
|
136
|
+
};
|
|
137
|
+
let json = serde_json::to_string(&config).unwrap();
|
|
138
|
+
let back: ImageExtractionConfig = serde_json::from_str(&json).unwrap();
|
|
139
|
+
assert_eq!(back.max_images_per_page, Some(50));
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Regression test for issue #766: missing field in JSON must not break
|
|
143
|
+
/// deserialization (backwards-compat — existing configs without this key
|
|
144
|
+
/// must still deserialize cleanly).
|
|
145
|
+
#[test]
|
|
146
|
+
fn test_max_images_per_page_absent_in_json_deserializes_as_none() {
|
|
147
|
+
let json = r#"{"extract_images":true,"target_dpi":300,"max_image_dimension":4096,
|
|
148
|
+
"inject_placeholders":true,"auto_adjust_dpi":true,
|
|
149
|
+
"min_dpi":72,"max_dpi":600}"#;
|
|
150
|
+
let config: ImageExtractionConfig = serde_json::from_str(json).unwrap();
|
|
151
|
+
assert_eq!(config.max_images_per_page, None);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
@@ -271,6 +271,13 @@ pub struct OcrConfig {
|
|
|
271
271
|
/// - `{{ language }}` — The document language code (e.g., "eng", "deu").
|
|
272
272
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
273
273
|
pub vlm_prompt: Option<String>,
|
|
274
|
+
|
|
275
|
+
/// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
|
|
276
|
+
///
|
|
277
|
+
/// Not user-configurable via config files — injected at runtime from
|
|
278
|
+
/// `ExtractionConfig::acceleration` before each `process_image` call.
|
|
279
|
+
#[serde(skip)]
|
|
280
|
+
pub acceleration: Option<super::acceleration::AccelerationConfig>,
|
|
274
281
|
}
|
|
275
282
|
|
|
276
283
|
impl Default for OcrConfig {
|
|
@@ -288,6 +295,7 @@ impl Default for OcrConfig {
|
|
|
288
295
|
auto_rotate: false,
|
|
289
296
|
vlm_config: None,
|
|
290
297
|
vlm_prompt: None,
|
|
298
|
+
acceleration: None,
|
|
291
299
|
}
|
|
292
300
|
}
|
|
293
301
|
}
|
|
@@ -14,11 +14,13 @@ use std::path::PathBuf;
|
|
|
14
14
|
/// * `Text` - Generic text splitter, splits on whitespace and punctuation
|
|
15
15
|
/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
|
16
16
|
/// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
|
|
17
|
-
/// * `Semantic` - Topic-aware chunker
|
|
18
|
-
///
|
|
19
|
-
///
|
|
20
|
-
///
|
|
21
|
-
///
|
|
17
|
+
/// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
|
|
18
|
+
/// embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
|
|
19
|
+
/// lower = more splits). Without an embedding, falls back to a
|
|
20
|
+
/// structural-boundary heuristic (ALL-CAPS headers, numbered sections,
|
|
21
|
+
/// blank-line paragraphs) and merges groups into chunks capped at
|
|
22
|
+
/// `max_characters` (default 1000). `topic_threshold` has no effect in the
|
|
23
|
+
/// fallback path. For best results, pair with an embedding model.
|
|
22
24
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
|
23
25
|
#[serde(rename_all = "lowercase")]
|
|
24
26
|
pub enum ChunkerType {
|
|
@@ -265,15 +267,10 @@ impl ChunkingConfig {
|
|
|
265
267
|
}
|
|
266
268
|
};
|
|
267
269
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
name: preset_name.clone(),
|
|
273
|
-
},
|
|
274
|
-
..EmbeddingConfig::default()
|
|
275
|
-
}),
|
|
276
|
-
};
|
|
270
|
+
// Preserve the caller's embedding choice, including None.
|
|
271
|
+
// Presets configure chunking parameters only; users must explicitly
|
|
272
|
+
// provide an EmbeddingConfig to opt into embedding generation.
|
|
273
|
+
let embedding = self.embedding.clone();
|
|
277
274
|
|
|
278
275
|
Self {
|
|
279
276
|
max_characters: preset.chunk_size,
|
|
@@ -566,11 +563,9 @@ mod tests {
|
|
|
566
563
|
let resolved = config.resolve_preset();
|
|
567
564
|
assert_eq!(resolved.max_characters, 1024);
|
|
568
565
|
assert_eq!(resolved.overlap, 100);
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
_ => panic!("Expected Preset model type"),
|
|
573
|
-
}
|
|
566
|
+
// Preset configures chunking parameters only; embedding stays None unless
|
|
567
|
+
// the caller explicitly provided one (#797).
|
|
568
|
+
assert!(resolved.embedding.is_none());
|
|
574
569
|
}
|
|
575
570
|
|
|
576
571
|
#[test]
|
|
@@ -684,4 +679,69 @@ mod tests {
|
|
|
684
679
|
_ => panic!("Expected Custom variant"),
|
|
685
680
|
}
|
|
686
681
|
}
|
|
682
|
+
|
|
683
|
+
// --- Issue #797 regression tests ---
|
|
684
|
+
|
|
685
|
+
/// Preset with no explicit embedding: embedding must remain None.
|
|
686
|
+
///
|
|
687
|
+
/// Before the fix, `resolve_preset()` would silently inject an
|
|
688
|
+
/// `EmbeddingConfig` whenever a preset was configured, causing every
|
|
689
|
+
/// chunk to have an unexpected `.embedding` field populated.
|
|
690
|
+
#[test]
|
|
691
|
+
#[cfg(feature = "embeddings")]
|
|
692
|
+
fn test_resolve_preset_does_not_inject_embedding_when_none() {
|
|
693
|
+
let config = ChunkingConfig {
|
|
694
|
+
preset: Some("multilingual".to_string()),
|
|
695
|
+
embedding: None,
|
|
696
|
+
..Default::default()
|
|
697
|
+
};
|
|
698
|
+
let resolved = config.resolve_preset();
|
|
699
|
+
assert!(
|
|
700
|
+
resolved.embedding.is_none(),
|
|
701
|
+
"preset alone must not inject an EmbeddingConfig (#797)"
|
|
702
|
+
);
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
/// Preset with an explicit embedding: the embedding must be preserved unchanged.
|
|
706
|
+
#[test]
|
|
707
|
+
#[cfg(feature = "embeddings")]
|
|
708
|
+
fn test_resolve_preset_preserves_explicit_embedding_config() {
|
|
709
|
+
let explicit = EmbeddingConfig {
|
|
710
|
+
model: EmbeddingModelType::Custom {
|
|
711
|
+
model_id: "my-org/model".to_string(),
|
|
712
|
+
dimensions: 768,
|
|
713
|
+
},
|
|
714
|
+
batch_size: 16,
|
|
715
|
+
..Default::default()
|
|
716
|
+
};
|
|
717
|
+
let config = ChunkingConfig {
|
|
718
|
+
preset: Some("multilingual".to_string()),
|
|
719
|
+
embedding: Some(explicit),
|
|
720
|
+
..Default::default()
|
|
721
|
+
};
|
|
722
|
+
let resolved = config.resolve_preset();
|
|
723
|
+
let emb = resolved
|
|
724
|
+
.embedding
|
|
725
|
+
.expect("explicit embedding must survive resolve_preset");
|
|
726
|
+
assert_eq!(emb.batch_size, 16);
|
|
727
|
+
match emb.model {
|
|
728
|
+
EmbeddingModelType::Custom { model_id, dimensions } => {
|
|
729
|
+
assert_eq!(model_id, "my-org/model");
|
|
730
|
+
assert_eq!(dimensions, 768);
|
|
731
|
+
}
|
|
732
|
+
other => panic!("expected Custom model type, got {other:?}"),
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// No preset, no embedding: embedding must stay None (regression guard).
|
|
737
|
+
#[test]
|
|
738
|
+
fn test_resolve_preset_no_preset_no_embedding_stays_none() {
|
|
739
|
+
let config = ChunkingConfig {
|
|
740
|
+
preset: None,
|
|
741
|
+
embedding: None,
|
|
742
|
+
..Default::default()
|
|
743
|
+
};
|
|
744
|
+
let resolved = config.resolve_preset();
|
|
745
|
+
assert!(resolved.embedding.is_none(), "no-preset path must not touch embedding");
|
|
746
|
+
}
|
|
687
747
|
}
|
|
@@ -64,11 +64,16 @@ where
|
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
/// Run a single extraction task with semaphore gating, timing, optional timeout, and batch mode.
|
|
67
|
+
///
|
|
68
|
+
/// When `cancel_token` is provided and the timeout fires, the token is signalled so that
|
|
69
|
+
/// any blocking pdfium operations in progress can observe the cancellation at the next
|
|
70
|
+
/// inter-page checkpoint and stop early.
|
|
67
71
|
#[cfg(feature = "tokio-runtime")]
|
|
68
72
|
async fn run_timed_extraction<F, Fut>(
|
|
69
73
|
index: usize,
|
|
70
74
|
semaphore: Arc<tokio::sync::Semaphore>,
|
|
71
75
|
timeout_secs: Option<u64>,
|
|
76
|
+
cancel_token: Option<crate::cancellation::CancellationToken>,
|
|
72
77
|
extract_fn: F,
|
|
73
78
|
) -> (usize, Result<ExtractionResult>, u64)
|
|
74
79
|
where
|
|
@@ -84,6 +89,11 @@ where
|
|
|
84
89
|
Some(secs) => match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
|
|
85
90
|
Ok(inner) => inner,
|
|
86
91
|
Err(_elapsed) => {
|
|
92
|
+
// Signal the cancellation token so that any blocking pdfium thread can
|
|
93
|
+
// detect it at the next inter-page checkpoint and stop processing.
|
|
94
|
+
if let Some(ref token) = cancel_token {
|
|
95
|
+
token.cancel();
|
|
96
|
+
}
|
|
87
97
|
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
88
98
|
Err(KreuzbergError::Timeout {
|
|
89
99
|
elapsed_ms,
|
|
@@ -200,7 +210,8 @@ pub async fn batch_extract_file(
|
|
|
200
210
|
let (ref path, ref file_config) = items[index];
|
|
201
211
|
let resolved = resolve_config(&cfg, file_config);
|
|
202
212
|
let timeout = resolved.extraction_timeout_secs;
|
|
203
|
-
|
|
213
|
+
let cancel_token = resolved.cancel_token.clone();
|
|
214
|
+
run_timed_extraction(index, sem, timeout, cancel_token, || {
|
|
204
215
|
let path = path.clone();
|
|
205
216
|
async move { extract_file(&path, None, &resolved).await }
|
|
206
217
|
})
|
|
@@ -301,7 +312,8 @@ pub async fn batch_extract_bytes(
|
|
|
301
312
|
let (bytes, mime_type, file_config) = slots[index].lock().take().expect("batch item already consumed");
|
|
302
313
|
let resolved = resolve_config(&cfg, &file_config);
|
|
303
314
|
let timeout = resolved.extraction_timeout_secs;
|
|
304
|
-
|
|
315
|
+
let cancel_token = resolved.cancel_token.clone();
|
|
316
|
+
run_timed_extraction(index, sem, timeout, cancel_token, || async move {
|
|
305
317
|
extract_bytes(&bytes, &mime_type, &resolved).await
|
|
306
318
|
})
|
|
307
319
|
.await
|