kreuzberg 4.9.1 → 4.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55):
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +15 -15
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +7 -0
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/vendor/Cargo.toml +5 -5
  8. data/vendor/kreuzberg/Cargo.toml +4 -4
  9. data/vendor/kreuzberg/README.md +1 -1
  10. data/vendor/kreuzberg/src/chunking/semantic/mod.rs +132 -19
  11. data/vendor/kreuzberg/src/core/config/extraction/types.rs +53 -0
  12. data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
  13. data/vendor/kreuzberg/src/core/config/processing.rs +7 -5
  14. data/vendor/kreuzberg/src/core/extractor/batch.rs +14 -2
  15. data/vendor/kreuzberg/src/extraction/docx/mod.rs +102 -413
  16. data/vendor/kreuzberg/src/extraction/docx/parser.rs +91 -4
  17. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
  18. data/vendor/kreuzberg/src/extraction/pst.rs +111 -4
  19. data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
  20. data/vendor/kreuzberg/src/extractors/docx.rs +21 -26
  21. data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
  22. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
  23. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
  24. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
  25. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +32 -1
  26. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +26 -5
  27. data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
  28. data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
  29. data/vendor/kreuzberg/src/llm/client.rs +26 -6
  30. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +49 -3
  31. data/vendor/kreuzberg/src/pdf/structure/adapters.rs +40 -1
  32. data/vendor/kreuzberg/src/pdf/structure/assembly.rs +32 -0
  33. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +21 -0
  34. data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +31 -6
  35. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +735 -114
  36. data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
  37. data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +114 -12
  38. data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
  39. data/vendor/kreuzberg/tests/config_loading_tests.rs +7 -5
  40. data/vendor/kreuzberg/tests/llm_integration.rs +3 -3
  41. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
  42. data/vendor/kreuzberg-ffi/Cargo.toml +5 -3
  43. data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
  44. data/vendor/kreuzberg-ffi/src/config/loader.rs +5 -0
  45. data/vendor/kreuzberg-ffi/src/config/merge.rs +1 -0
  46. data/vendor/kreuzberg-ffi/src/config/mod.rs +8 -4
  47. data/vendor/kreuzberg-ffi/src/config/serialize.rs +2 -0
  48. data/vendor/kreuzberg-ffi/src/config_builder.rs +3 -0
  49. data/vendor/kreuzberg-ffi/src/error.rs +9 -8
  50. data/vendor/kreuzberg-ffi/src/lib.rs +5 -1
  51. data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
  52. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  53. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  54. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  55. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 559b3104e6e21f2f14a92949d427703b51ea8b35b7a643d8964b6953785aa6e1
4
- data.tar.gz: bfce92579c45ecba0da0d8e1f077ecca0bb9dd6a1e96c950e8beb8d0a39b5884
3
+ metadata.gz: 3d8a203168595f6b316a165f500818abed75d89c7a82c46b5b20df996a4bb841
4
+ data.tar.gz: 28fd19fecd9b18597f17a783923ec3ec08cfa7b99612fec1ca8790aa5cdddbdc
5
5
  SHA512:
6
- metadata.gz: 1ea8af57d65eb5008126758041df2bfe07acca9d47ebfbf9c9de79f12b5d5ff2336d55b643268d1ea420db1825eaf9bfef6e5deb7335bd2449e9ccb62800492d
7
- data.tar.gz: '019f2abaa7dcaf2b91925f7ed0b5332ce569a41a5ae1f152d698423cce1276ea68e177c2088b4377c369943e235c70584f61a76da5600d6dd8b3bd075bc266ab'
6
+ metadata.gz: af522bff519c1082396d9a6a9480a088693791a4f50818fd1d233726082675559a3144f7758b0ee217b18cdbb1cd08236ecbb332f68c4186a26aa69b83454392
7
+ data.tar.gz: 4109e6dbc32c5fed518ba84940a7bc732553a178d3b67b69dad6eff5b998aad12b0a53cfe6c6d0784848cbb484e7b79a7abe9154f23d2100786f38414d8286c0
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -2127,9 +2127,9 @@ dependencies = [
2127
2127
 
2128
2128
  [[package]]
2129
2129
  name = "html-to-markdown-rs"
2130
- version = "3.2.5"
2130
+ version = "3.2.6"
2131
2131
  source = "registry+https://github.com/rust-lang/crates.io-index"
2132
- checksum = "bcb619abe81160bba2e2185823e10f6c0793220a266f16791aa715287de322cd"
2132
+ checksum = "bc4b9f5076d013aac34a0369c73035cf68f3d9e0771ce96a99e5a02e7e3bf9d4"
2133
2133
  dependencies = [
2134
2134
  "ahash",
2135
2135
  "astral-tl",
@@ -2916,7 +2916,7 @@ dependencies = [
2916
2916
 
2917
2917
  [[package]]
2918
2918
  name = "kreuzberg-rb"
2919
- version = "4.9.1"
2919
+ version = "4.9.3"
2920
2920
  dependencies = [
2921
2921
  "async-trait",
2922
2922
  "html-to-markdown-rs",
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
3040
3040
 
3041
3041
  [[package]]
3042
3042
  name = "liter-llm"
3043
- version = "1.2.1"
3043
+ version = "1.2.2"
3044
3044
  source = "registry+https://github.com/rust-lang/crates.io-index"
3045
- checksum = "1884be380e45da823105c85ef0fa188af81d57be7de9b65016576e1774fdd5f8"
3045
+ checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
3046
3046
  dependencies = [
3047
3047
  "base64 0.22.1",
3048
3048
  "bytes",
@@ -3634,9 +3634,9 @@ dependencies = [
3634
3634
 
3635
3635
  [[package]]
3636
3636
  name = "openssl"
3637
- version = "0.10.77"
3637
+ version = "0.10.78"
3638
3638
  source = "registry+https://github.com/rust-lang/crates.io-index"
3639
- checksum = "bfe4646e360ec77dff7dde40ed3d6c5fee52d156ef4a62f53973d38294dad87f"
3639
+ checksum = "f38c4372413cdaaf3cc79dd92d29d7d9f5ab09b51b10dded508fb90bb70b9222"
3640
3640
  dependencies = [
3641
3641
  "bitflags",
3642
3642
  "cfg-if",
@@ -3666,9 +3666,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
3666
3666
 
3667
3667
  [[package]]
3668
3668
  name = "openssl-sys"
3669
- version = "0.9.113"
3669
+ version = "0.9.114"
3670
3670
  source = "registry+https://github.com/rust-lang/crates.io-index"
3671
- checksum = "ad2f2c0eba47118757e4c6d2bff2838f3e0523380021356e7875e858372ce644"
3671
+ checksum = "13ce1245cd07fcc4cfdb438f7507b0c7e4f3849a69fd84d52374c66d83741bb6"
3672
3672
  dependencies = [
3673
3673
  "cc",
3674
3674
  "libc",
@@ -4619,9 +4619,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
4619
4619
 
4620
4620
  [[package]]
4621
4621
  name = "rustls-webpki"
4622
- version = "0.103.12"
4622
+ version = "0.103.13"
4623
4623
  source = "registry+https://github.com/rust-lang/crates.io-index"
4624
- checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06"
4624
+ checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
4625
4625
  dependencies = [
4626
4626
  "aws-lc-rs",
4627
4627
  "ring",
@@ -5506,7 +5506,7 @@ dependencies = [
5506
5506
  "toml_datetime 1.1.1+spec-1.1.0",
5507
5507
  "toml_parser",
5508
5508
  "toml_writer",
5509
- "winnow 1.0.1",
5509
+ "winnow 1.0.2",
5510
5510
  ]
5511
5511
 
5512
5512
  [[package]]
@@ -5533,7 +5533,7 @@ version = "1.1.2+spec-1.1.0"
5533
5533
  source = "registry+https://github.com/rust-lang/crates.io-index"
5534
5534
  checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
5535
5535
  dependencies = [
5536
- "winnow 1.0.1",
5536
+ "winnow 1.0.2",
5537
5537
  ]
5538
5538
 
5539
5539
  [[package]]
@@ -6577,9 +6577,9 @@ checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
6577
6577
 
6578
6578
  [[package]]
6579
6579
  name = "winnow"
6580
- version = "1.0.1"
6580
+ version = "1.0.2"
6581
6581
  source = "registry+https://github.com/rust-lang/crates.io-index"
6582
- checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5"
6582
+ checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0"
6583
6583
 
6584
6584
  [[package]]
6585
6585
  name = "wit-bindgen"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.1"
3
+ version = "4.9.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.2.5", default-features = false }
68
+ html-to-markdown-rs = { version = "3.2.6", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -404,6 +404,12 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
404
404
  true
405
405
  };
406
406
 
407
+ let max_images_per_page = if let Some(val) = get_kw(ruby, hash, "max_images_per_page") {
408
+ Some(u32::try_convert(val)?)
409
+ } else {
410
+ None
411
+ };
412
+
407
413
  let config = ImageExtractionConfig {
408
414
  extract_images,
409
415
  target_dpi,
@@ -412,6 +418,7 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
412
418
  auto_adjust_dpi,
413
419
  min_dpi,
414
420
  max_dpi,
421
+ max_images_per_page,
415
422
  };
416
423
 
417
424
  Ok(config)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.1'
4
+ VERSION = '4.9.4'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.1"
5
+ version = "4.9.4"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,12 +28,12 @@ dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.2.5", default-features = false }
31
+ html-to-markdown-rs = { version = "3.2.6", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.1", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.1" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.34", default-features = false }
48
+ pdf_oxide = { version = "0.3.37", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.12.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.1"
3
+ version = "4.9.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
271
  "simd",
272
272
  ], optional = true }
273
273
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.2.5", default-features = false, features = [
274
+ html-to-markdown-rs = { version = "3.2.6", default-features = false, features = [
275
275
  "inline-images",
276
276
  "metadata",
277
277
  ], optional = true }
@@ -287,7 +287,7 @@ image = { version = "0.25.10", default-features = false, features = [
287
287
  ], optional = true }
288
288
  indexmap = "2.14.0"
289
289
  infer = "0.19.0"
290
- jotdown = "0.9"
290
+ jotdown = "0.10"
291
291
  kamadak-exif = { version = "0.6.1", optional = true }
292
292
 
293
293
  kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.1 Release**
21
+ > **🚀 Version 4.9.4 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -25,10 +25,6 @@ const SEGMENT_SIZE: usize = 200;
25
25
  #[cfg(feature = "embeddings")]
26
26
  const DEFAULT_TOPIC_THRESHOLD: f32 = 0.75;
27
27
 
28
- /// Safety ceiling for auto-budget when no embedding model is configured.
29
- /// Prevents unbounded chunks in header-less documents.
30
- const AUTO_BUDGET_CEILING: usize = 4000;
31
-
32
28
  /// Split text into semantically coherent chunks.
33
29
  ///
34
30
  /// Splits text into fine-grained segments, detects structural (and optionally
@@ -46,6 +42,8 @@ pub fn chunk_semantic(
46
42
  });
47
43
  }
48
44
 
45
+ warn_if_fallback_path(config);
46
+
49
47
  let seg_size = SEGMENT_SIZE;
50
48
  let has_markdown_headers = text.lines().any(crate::utils::markdown_utils::is_markdown_header);
51
49
  let splitter_segments: Vec<&str> = if has_markdown_headers {
@@ -165,11 +163,33 @@ fn compute_boundaries(_segments: &[Segment<'_>], forced: &[bool], _config: &Chun
165
163
  Ok(forced.to_vec())
166
164
  }
167
165
 
168
- /// Resolve the safety ceiling for chunk size.
166
+ /// Warn when the semantic chunker is invoked without an embedding model.
167
+ ///
168
+ /// Without an embedding, `chunk_semantic` falls back to a structural-boundary
169
+ /// heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs).
170
+ /// Topic-similarity chunking requires an embedding model. This warning makes
171
+ /// the fallback mode discoverable to callers who think they're getting
172
+ /// embedding-driven topic detection.
173
+ #[cfg(feature = "embeddings")]
174
+ fn warn_if_fallback_path(config: &ChunkingConfig) {
175
+ if config.embedding.is_none() {
176
+ tracing::warn!(
177
+ "chunker_type='semantic' without an EmbeddingConfig falls back to a \
178
+ structural-boundary heuristic; topic-similarity chunking requires an \
179
+ embedding model. Either configure `embedding` or switch to \
180
+ chunker_type='text'/'markdown' to silence this warning."
181
+ );
182
+ }
183
+ }
184
+
185
+ #[cfg(not(feature = "embeddings"))]
186
+ fn warn_if_fallback_path(_config: &ChunkingConfig) {}
187
+
188
+ /// Resolve the size ceiling for merged chunks.
169
189
  ///
170
- /// When an embedding preset is configured, use its chunk_size as the ceiling
171
- /// (chunks must fit in the model's context window). Otherwise use a generous
172
- /// default that prevents unbounded chunks in header-less documents.
190
+ /// When an embedding preset is configured, use its `chunk_size` so chunks fit
191
+ /// in the model's context window. Otherwise honor the caller's configured
192
+ /// `max_characters`.
173
193
  fn resolve_ceiling(config: &ChunkingConfig) -> usize {
174
194
  #[cfg(feature = "embeddings")]
175
195
  if let Some(ref emb) = config.embedding
@@ -178,8 +198,7 @@ fn resolve_ceiling(config: &ChunkingConfig) -> usize {
178
198
  {
179
199
  return size;
180
200
  }
181
- let _ = config;
182
- AUTO_BUDGET_CEILING
201
+ config.max_characters
183
202
  }
184
203
 
185
204
  #[cfg(test)]
@@ -306,30 +325,124 @@ mod tests {
306
325
  }
307
326
 
308
327
  #[test]
309
- fn ceiling_caps_oversized_headerless_text() {
310
- // A large block of text with no headers should be split at the ceiling,
311
- // not produce one unbounded chunk.
312
- let text = "word ".repeat(1500); // ~7500 chars, exceeds AUTO_BUDGET_CEILING
328
+ fn max_characters_caps_oversized_headerless_text() {
329
+ // A large block of text with no headers must be split so every chunk
330
+ // respects the caller's configured max_characters.
331
+ let text = "word ".repeat(1500); // ~7500 chars
332
+ let max = 1000;
313
333
  let config = ChunkingConfig {
314
- max_characters: 1000, // ignored by semantic chunker
334
+ max_characters: max,
315
335
  overlap: 0,
316
336
  trim: true,
317
337
  chunker_type: ChunkerType::Semantic,
318
338
  ..Default::default()
319
339
  };
320
340
  let result = chunk_semantic(&text, &config, None).unwrap();
321
- assert!(result.chunks.len() >= 2, "should split at ceiling, got 1 chunk");
341
+ assert!(result.chunks.len() >= 2, "should split at max_characters, got 1 chunk");
322
342
  for (i, chunk) in result.chunks.iter().enumerate() {
323
343
  assert!(
324
- chunk.content.chars().count() <= super::AUTO_BUDGET_CEILING + 100,
325
- "chunk {} exceeds ceiling: {} > {}",
344
+ chunk.content.chars().count() <= max,
345
+ "chunk {} exceeds max_characters: {} > {}",
326
346
  i,
327
347
  chunk.content.chars().count(),
328
- super::AUTO_BUDGET_CEILING
348
+ max
329
349
  );
330
350
  }
331
351
  }
332
352
 
353
+ #[test]
354
+ fn max_characters_controls_fallback_chunk_size() {
355
+ // bb-yq35 repro: with no embedding configured, different max_characters
356
+ // values must produce different chunking output.
357
+ let sample = format!(
358
+ "{}{}{}",
359
+ "Solar panel efficiency improves. ".repeat(200),
360
+ "\n\nFDA clinical trials require double-blind. ".repeat(200),
361
+ "\n\nQuantum entanglement needs cooling. ".repeat(200),
362
+ );
363
+
364
+ let run = |max: usize| {
365
+ let config = ChunkingConfig {
366
+ max_characters: max,
367
+ overlap: 0,
368
+ trim: true,
369
+ chunker_type: ChunkerType::Semantic,
370
+ ..Default::default()
371
+ };
372
+ chunk_semantic(&sample, &config, None).unwrap()
373
+ };
374
+
375
+ let small = run(500);
376
+ let large = run(1500);
377
+
378
+ assert!(
379
+ small.chunks.len() > large.chunks.len(),
380
+ "smaller max_characters must yield more chunks: small={}, large={}",
381
+ small.chunks.len(),
382
+ large.chunks.len()
383
+ );
384
+ for chunk in &small.chunks {
385
+ assert!(
386
+ chunk.content.chars().count() <= 500,
387
+ "small chunk exceeds cap: {}",
388
+ chunk.content.chars().count()
389
+ );
390
+ }
391
+ for chunk in &large.chunks {
392
+ assert!(
393
+ chunk.content.chars().count() <= 1500,
394
+ "large chunk exceeds cap: {}",
395
+ chunk.content.chars().count()
396
+ );
397
+ }
398
+ }
399
+
400
+ #[cfg(feature = "embeddings")]
401
+ #[test]
402
+ fn semantic_without_embedding_warns() {
403
+ use std::io::Write;
404
+ use std::sync::{Arc, Mutex};
405
+
406
+ #[derive(Clone, Default)]
407
+ struct Buf(Arc<Mutex<Vec<u8>>>);
408
+ impl Write for Buf {
409
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
410
+ self.0.lock().unwrap().extend_from_slice(buf);
411
+ Ok(buf.len())
412
+ }
413
+ fn flush(&mut self) -> std::io::Result<()> {
414
+ Ok(())
415
+ }
416
+ }
417
+ impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for Buf {
418
+ type Writer = Buf;
419
+ fn make_writer(&'a self) -> Self::Writer {
420
+ self.clone()
421
+ }
422
+ }
423
+
424
+ let buffer = Buf::default();
425
+ let subscriber = tracing_subscriber::fmt()
426
+ .with_writer(buffer.clone())
427
+ .with_max_level(tracing::Level::WARN)
428
+ .with_ansi(false)
429
+ .finish();
430
+
431
+ tracing::subscriber::with_default(subscriber, || {
432
+ let config = ChunkingConfig {
433
+ chunker_type: ChunkerType::Semantic,
434
+ ..Default::default()
435
+ };
436
+ let _ = chunk_semantic("hello world", &config, None).unwrap();
437
+ });
438
+
439
+ let captured = String::from_utf8(buffer.0.lock().unwrap().clone()).unwrap();
440
+ assert!(
441
+ captured.contains("without an EmbeddingConfig"),
442
+ "expected fallback warning in captured logs, got: {captured:?}"
443
+ );
444
+ }
445
+
333
446
  #[test]
334
447
  fn sections_with_headers_produce_separate_chunks() {
335
448
  // Each section has enough content that the segments span multiple paragraphs.
@@ -40,6 +40,18 @@ pub struct ImageExtractionConfig {
40
40
  /// Maximum DPI threshold
41
41
  #[serde(default = "default_max_dpi")]
42
42
  pub max_dpi: i32,
43
+
44
+ /// Maximum number of image objects to extract per PDF page.
45
+ ///
46
+ /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
47
+ /// can trigger extremely long or indefinite extraction times when every image
48
+ /// object on a dense page is decoded individually via pdfium FFI. Setting this
49
+ /// limit causes kreuzberg to stop collecting individual images once the count
50
+ /// per page reaches the cap and emit a warning instead.
51
+ ///
52
+ /// `None` (default) means no limit — all images are extracted.
53
+ #[serde(default)]
54
+ pub max_images_per_page: Option<u32>,
43
55
  }
44
56
 
45
57
  /// Token reduction configuration.
@@ -98,3 +110,44 @@ fn default_reduction_mode() -> String {
98
110
  fn default_confidence() -> f64 {
99
111
  0.8
100
112
  }
113
+
114
+ #[cfg(test)]
115
+ mod tests {
116
+ use super::*;
117
+
118
+ #[test]
119
+ fn test_max_images_per_page_defaults_none() {
120
+ let config = ImageExtractionConfig::default();
121
+ assert_eq!(config.max_images_per_page, None);
122
+ }
123
+
124
+ #[test]
125
+ fn test_max_images_per_page_serializes_as_null_when_none() {
126
+ let config = ImageExtractionConfig::default();
127
+ let json = serde_json::to_string(&config).unwrap();
128
+ assert!(json.contains("\"max_images_per_page\":null"));
129
+ }
130
+
131
+ #[test]
132
+ fn test_max_images_per_page_roundtrips_via_json() {
133
+ let config = ImageExtractionConfig {
134
+ max_images_per_page: Some(50),
135
+ ..Default::default()
136
+ };
137
+ let json = serde_json::to_string(&config).unwrap();
138
+ let back: ImageExtractionConfig = serde_json::from_str(&json).unwrap();
139
+ assert_eq!(back.max_images_per_page, Some(50));
140
+ }
141
+
142
+ /// Regression test for issue #766: missing field in JSON must not break
143
+ /// deserialization (backwards-compat — existing configs without this key
144
+ /// must still deserialize cleanly).
145
+ #[test]
146
+ fn test_max_images_per_page_absent_in_json_deserializes_as_none() {
147
+ let json = r#"{"extract_images":true,"target_dpi":300,"max_image_dimension":4096,
148
+ "inject_placeholders":true,"auto_adjust_dpi":true,
149
+ "min_dpi":72,"max_dpi":600}"#;
150
+ let config: ImageExtractionConfig = serde_json::from_str(json).unwrap();
151
+ assert_eq!(config.max_images_per_page, None);
152
+ }
153
+ }
@@ -323,9 +323,12 @@ impl OcrConfig {
323
323
  /// Returns the effective pipeline config.
324
324
  ///
325
325
  /// - If `pipeline` is explicitly set, returns it.
326
- /// - If `paddle-ocr` feature is compiled in and no explicit pipeline is set,
327
- /// auto-constructs a default pipeline: primary backend (priority 100) + paddleocr (priority 50).
328
- /// - Otherwise returns `None` (single-backend mode, same as today).
326
+ /// - If `paddle-ocr` is compiled in and the backend is the default
327
+ /// (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
328
+ /// - Otherwise returns `None` (single-backend mode).
329
+ ///
330
+ /// Explicit non-default backend selections are honored as-is — a silent
331
+ /// paddleocr fallback would mask errors from the chosen backend.
329
332
  pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
330
333
  if self.pipeline.is_some() {
331
334
  return self.pipeline.clone();
@@ -333,25 +336,28 @@ impl OcrConfig {
333
336
 
334
337
  #[cfg(feature = "paddle-ocr")]
335
338
  {
336
- let mut stages = vec![OcrPipelineStage {
337
- backend: self.backend.clone(),
338
- priority: 100,
339
- language: None,
340
- tesseract_config: self.tesseract_config.clone(),
341
- paddle_ocr_config: None,
342
- vlm_config: self.vlm_config.clone(),
343
- }];
344
- // Only add paddleocr fallback if primary backend isn't already paddleocr
345
- if self.backend != "paddleocr" {
346
- stages.push(OcrPipelineStage {
339
+ if self.backend != default_tesseract_backend() {
340
+ return None;
341
+ }
342
+
343
+ let stages = vec![
344
+ OcrPipelineStage {
345
+ backend: self.backend.clone(),
346
+ priority: 100,
347
+ language: None,
348
+ tesseract_config: self.tesseract_config.clone(),
349
+ paddle_ocr_config: None,
350
+ vlm_config: self.vlm_config.clone(),
351
+ },
352
+ OcrPipelineStage {
347
353
  backend: "paddleocr".to_string(),
348
354
  priority: 50,
349
355
  language: None,
350
356
  tesseract_config: None,
351
357
  paddle_ocr_config: self.paddle_ocr_config.clone(),
352
358
  vlm_config: None,
353
- });
354
- }
359
+ },
360
+ ];
355
361
  Some(OcrPipelineConfig {
356
362
  stages,
357
363
  quality_thresholds: self.effective_thresholds(),
@@ -485,29 +491,21 @@ mod tests {
485
491
  }
486
492
 
487
493
  #[test]
488
- fn test_effective_pipeline_paddleocr_backend_no_duplicate() {
489
- // When primary backend is "paddleocr", effective_pipeline should NOT add
490
- // a second paddleocr stage (issue #6 fix).
494
+ fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
491
495
  let config = OcrConfig {
492
496
  backend: "paddleocr".to_string(),
493
497
  ..Default::default()
494
498
  };
495
- let result = config.effective_pipeline();
496
- // With paddle-ocr feature: should have exactly 1 stage (no duplicate)
497
- // Without paddle-ocr feature: should be None
498
- #[cfg(feature = "paddle-ocr")]
499
- {
500
- let pipeline = result.unwrap();
501
- let paddle_count = pipeline.stages.iter().filter(|s| s.backend == "paddleocr").count();
502
- assert_eq!(
503
- paddle_count, 1,
504
- "Should not have duplicate paddleocr stages, found {paddle_count}"
505
- );
506
- }
507
- #[cfg(not(feature = "paddle-ocr"))]
508
- {
509
- assert!(result.is_none());
510
- }
499
+ assert!(config.effective_pipeline().is_none());
500
+ }
501
+
502
+ #[test]
503
+ fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
504
+ let config = OcrConfig {
505
+ backend: "easyocr".to_string(),
506
+ ..Default::default()
507
+ };
508
+ assert!(config.effective_pipeline().is_none());
511
509
  }
512
510
 
513
511
  #[test]
@@ -14,11 +14,13 @@ use std::path::PathBuf;
14
14
  /// * `Text` - Generic text splitter, splits on whitespace and punctuation
15
15
  /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
16
16
  /// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
17
- /// * `Semantic` - Topic-aware chunker that splits at natural document boundaries
18
- /// (headers, paragraph breaks, topic shifts). Works out of the box with no extra
19
- /// configuration. Optionally add an `EmbeddingConfig` for embedding-based topic
20
- /// detection; `topic_threshold` (default 0.75) and `max_characters` (default 1000)
21
- /// are automatically applied when not specified.
17
+ /// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
18
+ /// embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
19
+ /// lower = more splits). Without an embedding, falls back to a
20
+ /// structural-boundary heuristic (ALL-CAPS headers, numbered sections,
21
+ /// blank-line paragraphs) and merges groups into chunks capped at
22
+ /// `max_characters` (default 1000). `topic_threshold` has no effect in the
23
+ /// fallback path. For best results, pair with an embedding model.
22
24
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
23
25
  #[serde(rename_all = "lowercase")]
24
26
  pub enum ChunkerType {