kreuzberg 4.9.2 → 4.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  4. data/ext/kreuzberg_rb/native/src/config/types.rs +8 -0
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +6 -6
  7. data/vendor/kreuzberg/Cargo.toml +5 -5
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/chunking/semantic/mod.rs +132 -19
  10. data/vendor/kreuzberg/src/core/config/extraction/types.rs +53 -0
  11. data/vendor/kreuzberg/src/core/config/ocr.rs +8 -0
  12. data/vendor/kreuzberg/src/core/config/processing.rs +79 -19
  13. data/vendor/kreuzberg/src/core/extractor/batch.rs +14 -2
  14. data/vendor/kreuzberg/src/core/extractor/bytes.rs +27 -3
  15. data/vendor/kreuzberg/src/core/extractor/file.rs +27 -3
  16. data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -20
  17. data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
  18. data/vendor/kreuzberg/src/extraction/docx/mod.rs +102 -413
  19. data/vendor/kreuzberg/src/extraction/docx/parser.rs +91 -4
  20. data/vendor/kreuzberg/src/extraction/email.rs +72 -10
  21. data/vendor/kreuzberg/src/extraction/image.rs +2 -2
  22. data/vendor/kreuzberg/src/extraction/image_ocr.rs +6 -1
  23. data/vendor/kreuzberg/src/extraction/pst.rs +111 -4
  24. data/vendor/kreuzberg/src/extraction/transform/content.rs +249 -4
  25. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -5
  26. data/vendor/kreuzberg/src/extractors/docx.rs +21 -26
  27. data/vendor/kreuzberg/src/extractors/email.rs +12 -11
  28. data/vendor/kreuzberg/src/extractors/hwp.rs +18 -5
  29. data/vendor/kreuzberg/src/extractors/image.rs +11 -6
  30. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +28 -1
  31. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +51 -19
  32. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +26 -8
  33. data/vendor/kreuzberg/src/llm/client.rs +26 -6
  34. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +49 -3
  35. data/vendor/kreuzberg/src/mcp/params.rs +17 -1
  36. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +1 -0
  37. data/vendor/kreuzberg/src/ocr/types.rs +11 -1
  38. data/vendor/kreuzberg/src/ort_discovery.rs +74 -22
  39. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +108 -10
  40. data/vendor/kreuzberg/src/pdf/images.rs +134 -8
  41. data/vendor/kreuzberg/src/pdf/structure/adapters.rs +40 -1
  42. data/vendor/kreuzberg/src/pdf/structure/assembly.rs +32 -0
  43. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +21 -0
  44. data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +31 -6
  45. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +69 -11
  46. data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +123 -12
  47. data/vendor/kreuzberg/src/rendering/djot.rs +8 -0
  48. data/vendor/kreuzberg/src/rendering/markdown.rs +7 -0
  49. data/vendor/kreuzberg/src/rendering/plain.rs +16 -7
  50. data/vendor/kreuzberg/src/types/formats.rs +6 -2
  51. data/vendor/kreuzberg/src/utils/image_decode.rs +99 -0
  52. data/vendor/kreuzberg/src/utils/mod.rs +8 -0
  53. data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
  54. data/vendor/kreuzberg/tests/config_loading_tests.rs +7 -5
  55. data/vendor/kreuzberg/tests/docx_ocr_integration_test.rs +84 -0
  56. data/vendor/kreuzberg/tests/email_integration.rs +18 -7
  57. data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +92 -0
  58. data/vendor/kreuzberg/tests/gpu_acceleration.rs +419 -0
  59. data/vendor/kreuzberg/tests/issue_797_preset_embedding_regression.rs +75 -0
  60. data/vendor/kreuzberg/tests/llm_integration.rs +3 -3
  61. data/vendor/kreuzberg/tests/markdown_lint_quality.rs +18 -6
  62. data/vendor/kreuzberg/tests/mcp_integration.rs +13 -5
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +16 -20
  65. data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +129 -0
  66. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
  67. data/vendor/kreuzberg/tests/test_batch_extract_schema.rs +56 -0
  68. data/vendor/kreuzberg-ffi/Cargo.toml +5 -3
  69. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  70. data/vendor/kreuzberg-ffi/src/config/loader.rs +5 -0
  71. data/vendor/kreuzberg-ffi/src/config/merge.rs +1 -0
  72. data/vendor/kreuzberg-ffi/src/config/mod.rs +8 -4
  73. data/vendor/kreuzberg-ffi/src/config/serialize.rs +2 -0
  74. data/vendor/kreuzberg-ffi/src/config_builder.rs +3 -0
  75. data/vendor/kreuzberg-ffi/src/lib.rs +5 -1
  76. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  77. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  78. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  79. data/vendor/kreuzberg-tesseract/build.rs +5 -0
  80. metadata +8 -3
  81. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6921
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 431f2a7c19d9a4404369b7c9fe80fd10233b34b3462cf3c0076aac9b7f3065cb
4
- data.tar.gz: bf36fd9dd46ffb84374e8f64ffb1f61ac6774cc8dd56d43d404e0b8995813ffc
3
+ metadata.gz: 9f3132b44aad1652c76e8b1445b775eb3586e48661908eda794c95339f06387d
4
+ data.tar.gz: 2f957af07040ec2f3bcd79c299dd429a752423d714eea73bfb608a28718a6c11
5
5
  SHA512:
6
- metadata.gz: 2cf3ad1e3edd0778537e542bcafc121b6cd14235e271d2b9b27747f7beab7a2899362426f450a06f9ff0d0d4365982acb68300e46568887a871ad1bda6767d00
7
- data.tar.gz: e7bcbe73f50701fbd4a0cb7e75e588096c02202198b234ba6f830b846f4fabd95e051ed0c25ed4a1c1d72a933cf5ccf9d755129eb1dc54ea446a60d9d9946997
6
+ metadata.gz: 878748ecb791e049c2de05cdc4ec7b9f6749bb265981c98ea49126108ca7c2782b92a6b5ed31d1fbfbeee83e3c45c80aaf74aacecd20f9bc428d796709afa0aa
7
+ data.tar.gz: ff137eb78f8fcfcc2ac357b0d9adf6d3d6fee11a448679a976678e0745905a0abcd8abfeb331028d780810d96dc47a04ec01dda94c900ec63ad4b35c124c187f
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.6" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.2"
3
+ version = "4.9.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.2.5", default-features = false }
68
+ html-to-markdown-rs = { version = "3.3.1", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -54,6 +54,7 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
54
54
  quality_thresholds: None,
55
55
  vlm_config: None,
56
56
  vlm_prompt: None,
57
+ acceleration: None,
57
58
  };
58
59
 
59
60
  if let Some(val) = get_kw(ruby, hash, "tesseract_config")
@@ -404,6 +405,12 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
404
405
  true
405
406
  };
406
407
 
408
+ let max_images_per_page = if let Some(val) = get_kw(ruby, hash, "max_images_per_page") {
409
+ Some(u32::try_convert(val)?)
410
+ } else {
411
+ None
412
+ };
413
+
407
414
  let config = ImageExtractionConfig {
408
415
  extract_images,
409
416
  target_dpi,
@@ -412,6 +419,7 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
412
419
  auto_adjust_dpi,
413
420
  min_dpi,
414
421
  max_dpi,
422
+ max_images_per_page,
415
423
  };
416
424
 
417
425
  Ok(config)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.2'
4
+ VERSION = '4.9.6'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.2"
5
+ version = "4.9.6"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,12 +28,12 @@ dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.2.5", default-features = false }
31
+ html-to-markdown-rs = { version = "3.3.1", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.2", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.2" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.6", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.6" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.34", default-features = false }
48
+ pdf_oxide = { version = "0.3.37", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.12.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
57
57
  tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
58
  toml = "1.1.2"
59
59
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false }
60
+ tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
61
61
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
62
  wasm-bindgen-futures = "0.4"
63
63
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.2"
3
+ version = "4.9.6"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
271
  "simd",
272
272
  ], optional = true }
273
273
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.2.5", default-features = false, features = [
274
+ html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
275
275
  "inline-images",
276
276
  "metadata",
277
277
  ], optional = true }
@@ -287,7 +287,7 @@ image = { version = "0.25.10", default-features = false, features = [
287
287
  ], optional = true }
288
288
  indexmap = "2.14.0"
289
289
  infer = "0.19.0"
290
- jotdown = "0.9"
290
+ jotdown = "0.10"
291
291
  kamadak-exif = { version = "0.6.1", optional = true }
292
292
 
293
293
  kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -392,7 +392,7 @@ optional = true
392
392
  # Override getrandom to enable js feature for WASM targets
393
393
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
394
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.6.2", features = ["serde"], default-features = false, optional = true }
395
+ tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
396
396
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
397
 
398
398
  [build-dependencies]
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.2 Release**
21
+ > **🚀 Version 4.9.6 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -25,10 +25,6 @@ const SEGMENT_SIZE: usize = 200;
25
25
  #[cfg(feature = "embeddings")]
26
26
  const DEFAULT_TOPIC_THRESHOLD: f32 = 0.75;
27
27
 
28
- /// Safety ceiling for auto-budget when no embedding model is configured.
29
- /// Prevents unbounded chunks in header-less documents.
30
- const AUTO_BUDGET_CEILING: usize = 4000;
31
-
32
28
  /// Split text into semantically coherent chunks.
33
29
  ///
34
30
  /// Splits text into fine-grained segments, detects structural (and optionally
@@ -46,6 +42,8 @@ pub fn chunk_semantic(
46
42
  });
47
43
  }
48
44
 
45
+ warn_if_fallback_path(config);
46
+
49
47
  let seg_size = SEGMENT_SIZE;
50
48
  let has_markdown_headers = text.lines().any(crate::utils::markdown_utils::is_markdown_header);
51
49
  let splitter_segments: Vec<&str> = if has_markdown_headers {
@@ -165,11 +163,33 @@ fn compute_boundaries(_segments: &[Segment<'_>], forced: &[bool], _config: &Chun
165
163
  Ok(forced.to_vec())
166
164
  }
167
165
 
168
- /// Resolve the safety ceiling for chunk size.
166
+ /// Warn when the semantic chunker is invoked without an embedding model.
167
+ ///
168
+ /// Without an embedding, `chunk_semantic` falls back to a structural-boundary
169
+ /// heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs).
170
+ /// Topic-similarity chunking requires an embedding model. This warning makes
171
+ /// the fallback mode discoverable to callers who think they're getting
172
+ /// embedding-driven topic detection.
173
+ #[cfg(feature = "embeddings")]
174
+ fn warn_if_fallback_path(config: &ChunkingConfig) {
175
+ if config.embedding.is_none() {
176
+ tracing::warn!(
177
+ "chunker_type='semantic' without an EmbeddingConfig falls back to a \
178
+ structural-boundary heuristic; topic-similarity chunking requires an \
179
+ embedding model. Either configure `embedding` or switch to \
180
+ chunker_type='text'/'markdown' to silence this warning."
181
+ );
182
+ }
183
+ }
184
+
185
+ #[cfg(not(feature = "embeddings"))]
186
+ fn warn_if_fallback_path(_config: &ChunkingConfig) {}
187
+
188
+ /// Resolve the size ceiling for merged chunks.
169
189
  ///
170
- /// When an embedding preset is configured, use its chunk_size as the ceiling
171
- /// (chunks must fit in the model's context window). Otherwise use a generous
172
- /// default that prevents unbounded chunks in header-less documents.
190
+ /// When an embedding preset is configured, use its `chunk_size` so chunks fit
191
+ /// in the model's context window. Otherwise honor the caller's configured
192
+ /// `max_characters`.
173
193
  fn resolve_ceiling(config: &ChunkingConfig) -> usize {
174
194
  #[cfg(feature = "embeddings")]
175
195
  if let Some(ref emb) = config.embedding
@@ -178,8 +198,7 @@ fn resolve_ceiling(config: &ChunkingConfig) -> usize {
178
198
  {
179
199
  return size;
180
200
  }
181
- let _ = config;
182
- AUTO_BUDGET_CEILING
201
+ config.max_characters
183
202
  }
184
203
 
185
204
  #[cfg(test)]
@@ -306,30 +325,124 @@ mod tests {
306
325
  }
307
326
 
308
327
  #[test]
309
- fn ceiling_caps_oversized_headerless_text() {
310
- // A large block of text with no headers should be split at the ceiling,
311
- // not produce one unbounded chunk.
312
- let text = "word ".repeat(1500); // ~7500 chars, exceeds AUTO_BUDGET_CEILING
328
+ fn max_characters_caps_oversized_headerless_text() {
329
+ // A large block of text with no headers must be split so every chunk
330
+ // respects the caller's configured max_characters.
331
+ let text = "word ".repeat(1500); // ~7500 chars
332
+ let max = 1000;
313
333
  let config = ChunkingConfig {
314
- max_characters: 1000, // ignored by semantic chunker
334
+ max_characters: max,
315
335
  overlap: 0,
316
336
  trim: true,
317
337
  chunker_type: ChunkerType::Semantic,
318
338
  ..Default::default()
319
339
  };
320
340
  let result = chunk_semantic(&text, &config, None).unwrap();
321
- assert!(result.chunks.len() >= 2, "should split at ceiling, got 1 chunk");
341
+ assert!(result.chunks.len() >= 2, "should split at max_characters, got 1 chunk");
322
342
  for (i, chunk) in result.chunks.iter().enumerate() {
323
343
  assert!(
324
- chunk.content.chars().count() <= super::AUTO_BUDGET_CEILING + 100,
325
- "chunk {} exceeds ceiling: {} > {}",
344
+ chunk.content.chars().count() <= max,
345
+ "chunk {} exceeds max_characters: {} > {}",
326
346
  i,
327
347
  chunk.content.chars().count(),
328
- super::AUTO_BUDGET_CEILING
348
+ max
329
349
  );
330
350
  }
331
351
  }
332
352
 
353
+ #[test]
354
+ fn max_characters_controls_fallback_chunk_size() {
355
+ // bb-yq35 repro: with no embedding configured, different max_characters
356
+ // values must produce different chunking output.
357
+ let sample = format!(
358
+ "{}{}{}",
359
+ "Solar panel efficiency improves. ".repeat(200),
360
+ "\n\nFDA clinical trials require double-blind. ".repeat(200),
361
+ "\n\nQuantum entanglement needs cooling. ".repeat(200),
362
+ );
363
+
364
+ let run = |max: usize| {
365
+ let config = ChunkingConfig {
366
+ max_characters: max,
367
+ overlap: 0,
368
+ trim: true,
369
+ chunker_type: ChunkerType::Semantic,
370
+ ..Default::default()
371
+ };
372
+ chunk_semantic(&sample, &config, None).unwrap()
373
+ };
374
+
375
+ let small = run(500);
376
+ let large = run(1500);
377
+
378
+ assert!(
379
+ small.chunks.len() > large.chunks.len(),
380
+ "smaller max_characters must yield more chunks: small={}, large={}",
381
+ small.chunks.len(),
382
+ large.chunks.len()
383
+ );
384
+ for chunk in &small.chunks {
385
+ assert!(
386
+ chunk.content.chars().count() <= 500,
387
+ "small chunk exceeds cap: {}",
388
+ chunk.content.chars().count()
389
+ );
390
+ }
391
+ for chunk in &large.chunks {
392
+ assert!(
393
+ chunk.content.chars().count() <= 1500,
394
+ "large chunk exceeds cap: {}",
395
+ chunk.content.chars().count()
396
+ );
397
+ }
398
+ }
399
+
400
+ #[cfg(feature = "embeddings")]
401
+ #[test]
402
+ fn semantic_without_embedding_warns() {
403
+ use std::io::Write;
404
+ use std::sync::{Arc, Mutex};
405
+
406
+ #[derive(Clone, Default)]
407
+ struct Buf(Arc<Mutex<Vec<u8>>>);
408
+ impl Write for Buf {
409
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
410
+ self.0.lock().unwrap().extend_from_slice(buf);
411
+ Ok(buf.len())
412
+ }
413
+ fn flush(&mut self) -> std::io::Result<()> {
414
+ Ok(())
415
+ }
416
+ }
417
+ impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for Buf {
418
+ type Writer = Buf;
419
+ fn make_writer(&'a self) -> Self::Writer {
420
+ self.clone()
421
+ }
422
+ }
423
+
424
+ let buffer = Buf::default();
425
+ let subscriber = tracing_subscriber::fmt()
426
+ .with_writer(buffer.clone())
427
+ .with_max_level(tracing::Level::WARN)
428
+ .with_ansi(false)
429
+ .finish();
430
+
431
+ tracing::subscriber::with_default(subscriber, || {
432
+ let config = ChunkingConfig {
433
+ chunker_type: ChunkerType::Semantic,
434
+ ..Default::default()
435
+ };
436
+ let _ = chunk_semantic("hello world", &config, None).unwrap();
437
+ });
438
+
439
+ let captured = String::from_utf8(buffer.0.lock().unwrap().clone()).unwrap();
440
+ assert!(
441
+ captured.contains("without an EmbeddingConfig"),
442
+ "expected fallback warning in captured logs, got: {captured:?}"
443
+ );
444
+ }
445
+
333
446
  #[test]
334
447
  fn sections_with_headers_produce_separate_chunks() {
335
448
  // Each section has enough content that the segments span multiple paragraphs.
@@ -40,6 +40,18 @@ pub struct ImageExtractionConfig {
40
40
  /// Maximum DPI threshold
41
41
  #[serde(default = "default_max_dpi")]
42
42
  pub max_dpi: i32,
43
+
44
+ /// Maximum number of image objects to extract per PDF page.
45
+ ///
46
+ /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
47
+ /// can trigger extremely long or indefinite extraction times when every image
48
+ /// object on a dense page is decoded individually via pdfium FFI. Setting this
49
+ /// limit causes kreuzberg to stop collecting individual images once the count
50
+ /// per page reaches the cap and emit a warning instead.
51
+ ///
52
+ /// `None` (default) means no limit — all images are extracted.
53
+ #[serde(default)]
54
+ pub max_images_per_page: Option<u32>,
43
55
  }
44
56
 
45
57
  /// Token reduction configuration.
@@ -98,3 +110,44 @@ fn default_reduction_mode() -> String {
98
110
  fn default_confidence() -> f64 {
99
111
  0.8
100
112
  }
113
+
114
+ #[cfg(test)]
115
+ mod tests {
116
+ use super::*;
117
+
118
+ #[test]
119
+ fn test_max_images_per_page_defaults_none() {
120
+ let config = ImageExtractionConfig::default();
121
+ assert_eq!(config.max_images_per_page, None);
122
+ }
123
+
124
+ #[test]
125
+ fn test_max_images_per_page_serializes_as_null_when_none() {
126
+ let config = ImageExtractionConfig::default();
127
+ let json = serde_json::to_string(&config).unwrap();
128
+ assert!(json.contains("\"max_images_per_page\":null"));
129
+ }
130
+
131
+ #[test]
132
+ fn test_max_images_per_page_roundtrips_via_json() {
133
+ let config = ImageExtractionConfig {
134
+ max_images_per_page: Some(50),
135
+ ..Default::default()
136
+ };
137
+ let json = serde_json::to_string(&config).unwrap();
138
+ let back: ImageExtractionConfig = serde_json::from_str(&json).unwrap();
139
+ assert_eq!(back.max_images_per_page, Some(50));
140
+ }
141
+
142
+ /// Regression test for issue #766: missing field in JSON must not break
143
+ /// deserialization (backwards-compat — existing configs without this key
144
+ /// must still deserialize cleanly).
145
+ #[test]
146
+ fn test_max_images_per_page_absent_in_json_deserializes_as_none() {
147
+ let json = r#"{"extract_images":true,"target_dpi":300,"max_image_dimension":4096,
148
+ "inject_placeholders":true,"auto_adjust_dpi":true,
149
+ "min_dpi":72,"max_dpi":600}"#;
150
+ let config: ImageExtractionConfig = serde_json::from_str(json).unwrap();
151
+ assert_eq!(config.max_images_per_page, None);
152
+ }
153
+ }
@@ -271,6 +271,13 @@ pub struct OcrConfig {
271
271
  /// - `{{ language }}` — The document language code (e.g., "eng", "deu").
272
272
  #[serde(default, skip_serializing_if = "Option::is_none")]
273
273
  pub vlm_prompt: Option<String>,
274
+
275
+ /// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
276
+ ///
277
+ /// Not user-configurable via config files — injected at runtime from
278
+ /// `ExtractionConfig::acceleration` before each `process_image` call.
279
+ #[serde(skip)]
280
+ pub acceleration: Option<super::acceleration::AccelerationConfig>,
274
281
  }
275
282
 
276
283
  impl Default for OcrConfig {
@@ -288,6 +295,7 @@ impl Default for OcrConfig {
288
295
  auto_rotate: false,
289
296
  vlm_config: None,
290
297
  vlm_prompt: None,
298
+ acceleration: None,
291
299
  }
292
300
  }
293
301
  }
@@ -14,11 +14,13 @@ use std::path::PathBuf;
14
14
  /// * `Text` - Generic text splitter, splits on whitespace and punctuation
15
15
  /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
16
16
  /// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
17
- /// * `Semantic` - Topic-aware chunker that splits at natural document boundaries
18
- /// (headers, paragraph breaks, topic shifts). Works out of the box with no extra
19
- /// configuration. Optionally add an `EmbeddingConfig` for embedding-based topic
20
- /// detection; `topic_threshold` (default 0.75) and `max_characters` (default 1000)
21
- /// are automatically applied when not specified.
17
+ /// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
18
+ /// embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
19
+ /// lower = more splits). Without an embedding, falls back to a
20
+ /// structural-boundary heuristic (ALL-CAPS headers, numbered sections,
21
+ /// blank-line paragraphs) and merges groups into chunks capped at
22
+ /// `max_characters` (default 1000). `topic_threshold` has no effect in the
23
+ /// fallback path. For best results, pair with an embedding model.
22
24
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
23
25
  #[serde(rename_all = "lowercase")]
24
26
  pub enum ChunkerType {
@@ -265,15 +267,10 @@ impl ChunkingConfig {
265
267
  }
266
268
  };
267
269
 
268
- let embedding = match &self.embedding {
269
- Some(existing) => Some(existing.clone()),
270
- None => Some(EmbeddingConfig {
271
- model: EmbeddingModelType::Preset {
272
- name: preset_name.clone(),
273
- },
274
- ..EmbeddingConfig::default()
275
- }),
276
- };
270
+ // Preserve the caller's embedding choice, including None.
271
+ // Presets configure chunking parameters only; users must explicitly
272
+ // provide an EmbeddingConfig to opt into embedding generation.
273
+ let embedding = self.embedding.clone();
277
274
 
278
275
  Self {
279
276
  max_characters: preset.chunk_size,
@@ -566,11 +563,9 @@ mod tests {
566
563
  let resolved = config.resolve_preset();
567
564
  assert_eq!(resolved.max_characters, 1024);
568
565
  assert_eq!(resolved.overlap, 100);
569
- assert!(resolved.embedding.is_some());
570
- match &resolved.embedding.unwrap().model {
571
- EmbeddingModelType::Preset { name } => assert_eq!(name, "balanced"),
572
- _ => panic!("Expected Preset model type"),
573
- }
566
+ // Preset configures chunking parameters only; embedding stays None unless
567
+ // the caller explicitly provided one (#797).
568
+ assert!(resolved.embedding.is_none());
574
569
  }
575
570
 
576
571
  #[test]
@@ -684,4 +679,69 @@ mod tests {
684
679
  _ => panic!("Expected Custom variant"),
685
680
  }
686
681
  }
682
+
683
+ // --- Issue #797 regression tests ---
684
+
685
+ /// Preset with no explicit embedding: embedding must remain None.
686
+ ///
687
+ /// Before the fix, `resolve_preset()` would silently inject an
688
+ /// `EmbeddingConfig` whenever a preset was configured, causing every
689
+ /// chunk to have an unexpected `.embedding` field populated.
690
+ #[test]
691
+ #[cfg(feature = "embeddings")]
692
+ fn test_resolve_preset_does_not_inject_embedding_when_none() {
693
+ let config = ChunkingConfig {
694
+ preset: Some("multilingual".to_string()),
695
+ embedding: None,
696
+ ..Default::default()
697
+ };
698
+ let resolved = config.resolve_preset();
699
+ assert!(
700
+ resolved.embedding.is_none(),
701
+ "preset alone must not inject an EmbeddingConfig (#797)"
702
+ );
703
+ }
704
+
705
+ /// Preset with an explicit embedding: the embedding must be preserved unchanged.
706
+ #[test]
707
+ #[cfg(feature = "embeddings")]
708
+ fn test_resolve_preset_preserves_explicit_embedding_config() {
709
+ let explicit = EmbeddingConfig {
710
+ model: EmbeddingModelType::Custom {
711
+ model_id: "my-org/model".to_string(),
712
+ dimensions: 768,
713
+ },
714
+ batch_size: 16,
715
+ ..Default::default()
716
+ };
717
+ let config = ChunkingConfig {
718
+ preset: Some("multilingual".to_string()),
719
+ embedding: Some(explicit),
720
+ ..Default::default()
721
+ };
722
+ let resolved = config.resolve_preset();
723
+ let emb = resolved
724
+ .embedding
725
+ .expect("explicit embedding must survive resolve_preset");
726
+ assert_eq!(emb.batch_size, 16);
727
+ match emb.model {
728
+ EmbeddingModelType::Custom { model_id, dimensions } => {
729
+ assert_eq!(model_id, "my-org/model");
730
+ assert_eq!(dimensions, 768);
731
+ }
732
+ other => panic!("expected Custom model type, got {other:?}"),
733
+ }
734
+ }
735
+
736
+ /// No preset, no embedding: embedding must stay None (regression guard).
737
+ #[test]
738
+ fn test_resolve_preset_no_preset_no_embedding_stays_none() {
739
+ let config = ChunkingConfig {
740
+ preset: None,
741
+ embedding: None,
742
+ ..Default::default()
743
+ };
744
+ let resolved = config.resolve_preset();
745
+ assert!(resolved.embedding.is_none(), "no-preset path must not touch embedding");
746
+ }
687
747
  }
@@ -64,11 +64,16 @@ where
64
64
  }
65
65
 
66
66
  /// Run a single extraction task with semaphore gating, timing, optional timeout, and batch mode.
67
+ ///
68
+ /// When `cancel_token` is provided and the timeout fires, the token is signalled so that
69
+ /// any blocking pdfium operations in progress can observe the cancellation at the next
70
+ /// inter-page checkpoint and stop early.
67
71
  #[cfg(feature = "tokio-runtime")]
68
72
  async fn run_timed_extraction<F, Fut>(
69
73
  index: usize,
70
74
  semaphore: Arc<tokio::sync::Semaphore>,
71
75
  timeout_secs: Option<u64>,
76
+ cancel_token: Option<crate::cancellation::CancellationToken>,
72
77
  extract_fn: F,
73
78
  ) -> (usize, Result<ExtractionResult>, u64)
74
79
  where
@@ -84,6 +89,11 @@ where
84
89
  Some(secs) => match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
85
90
  Ok(inner) => inner,
86
91
  Err(_elapsed) => {
92
+ // Signal the cancellation token so that any blocking pdfium thread can
93
+ // detect it at the next inter-page checkpoint and stop processing.
94
+ if let Some(ref token) = cancel_token {
95
+ token.cancel();
96
+ }
87
97
  let elapsed_ms = start.elapsed().as_millis() as u64;
88
98
  Err(KreuzbergError::Timeout {
89
99
  elapsed_ms,
@@ -200,7 +210,8 @@ pub async fn batch_extract_file(
200
210
  let (ref path, ref file_config) = items[index];
201
211
  let resolved = resolve_config(&cfg, file_config);
202
212
  let timeout = resolved.extraction_timeout_secs;
203
- run_timed_extraction(index, sem, timeout, || {
213
+ let cancel_token = resolved.cancel_token.clone();
214
+ run_timed_extraction(index, sem, timeout, cancel_token, || {
204
215
  let path = path.clone();
205
216
  async move { extract_file(&path, None, &resolved).await }
206
217
  })
@@ -301,7 +312,8 @@ pub async fn batch_extract_bytes(
301
312
  let (bytes, mime_type, file_config) = slots[index].lock().take().expect("batch item already consumed");
302
313
  let resolved = resolve_config(&cfg, &file_config);
303
314
  let timeout = resolved.extraction_timeout_secs;
304
- run_timed_extraction(index, sem, timeout, || async move {
315
+ let cancel_token = resolved.cancel_token.clone();
316
+ run_timed_extraction(index, sem, timeout, cancel_token, || async move {
305
317
  extract_bytes(&bytes, &mime_type, &resolved).await
306
318
  })
307
319
  .await