RubyGems - kreuzberg - Versions diffs - 4.9.1 → 4.9.4 - Mend

kreuzberg 4.9.1 → 4.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +15 -15
data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
data/ext/kreuzberg_rb/native/src/config/types.rs +7 -0
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +5 -5
data/vendor/kreuzberg/Cargo.toml +4 -4
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/chunking/semantic/mod.rs +132 -19
data/vendor/kreuzberg/src/core/config/extraction/types.rs +53 -0
data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
data/vendor/kreuzberg/src/core/config/processing.rs +7 -5
data/vendor/kreuzberg/src/core/extractor/batch.rs +14 -2
data/vendor/kreuzberg/src/extraction/docx/mod.rs +102 -413
data/vendor/kreuzberg/src/extraction/docx/parser.rs +91 -4
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
data/vendor/kreuzberg/src/extraction/pst.rs +111 -4
data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
data/vendor/kreuzberg/src/extractors/docx.rs +21 -26
data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +32 -1
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +26 -5
data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
data/vendor/kreuzberg/src/llm/client.rs +26 -6
data/vendor/kreuzberg/src/llm/vlm_ocr.rs +49 -3
data/vendor/kreuzberg/src/pdf/structure/adapters.rs +40 -1
data/vendor/kreuzberg/src/pdf/structure/assembly.rs +32 -0
data/vendor/kreuzberg/src/pdf/structure/bridge.rs +21 -0
data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +31 -6
data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +735 -114
data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +114 -12
data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
data/vendor/kreuzberg/tests/config_loading_tests.rs +7 -5
data/vendor/kreuzberg/tests/llm_integration.rs +3 -3
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +2 -0
data/vendor/kreuzberg-ffi/Cargo.toml +5 -3
data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
data/vendor/kreuzberg-ffi/src/config/loader.rs +5 -0
data/vendor/kreuzberg-ffi/src/config/merge.rs +1 -0
data/vendor/kreuzberg-ffi/src/config/mod.rs +8 -4
data/vendor/kreuzberg-ffi/src/config/serialize.rs +2 -0
data/vendor/kreuzberg-ffi/src/config_builder.rs +3 -0
data/vendor/kreuzberg-ffi/src/error.rs +9 -8
data/vendor/kreuzberg-ffi/src/lib.rs +5 -1
data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 559b3104e6e21f2f14a92949d427703b51ea8b35b7a643d8964b6953785aa6e1
-  data.tar.gz: bfce92579c45ecba0da0d8e1f077ecca0bb9dd6a1e96c950e8beb8d0a39b5884
+  metadata.gz: 3d8a203168595f6b316a165f500818abed75d89c7a82c46b5b20df996a4bb841
+  data.tar.gz: 28fd19fecd9b18597f17a783923ec3ec08cfa7b99612fec1ca8790aa5cdddbdc
 SHA512:
-  metadata.gz: 1ea8af57d65eb5008126758041df2bfe07acca9d47ebfbf9c9de79f12b5d5ff2336d55b643268d1ea420db1825eaf9bfef6e5deb7335bd2449e9ccb62800492d
-  data.tar.gz: '019f2abaa7dcaf2b91925f7ed0b5332ce569a41a5ae1f152d698423cce1276ea68e177c2088b4377c369943e235c70584f61a76da5600d6dd8b3bd075bc266ab'
+  metadata.gz: af522bff519c1082396d9a6a9480a088693791a4f50818fd1d233726082675559a3144f7758b0ee217b18cdbb1cd08236ecbb332f68c4186a26aa69b83454392
+  data.tar.gz: 4109e6dbc32c5fed518ba84940a7bc732553a178d3b67b69dad6eff5b998aad12b0a53cfe6c6d0784848cbb484e7b79a7abe9154f23d2100786f38414d8286c0

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.lock CHANGED Viewed

@@ -2127,9 +2127,9 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-rs"
-version = "3.2.5"
+version = "3.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bcb619abe81160bba2e2185823e10f6c0793220a266f16791aa715287de322cd"
+checksum = "bc4b9f5076d013aac34a0369c73035cf68f3d9e0771ce96a99e5a02e7e3bf9d4"
 dependencies = [
  "ahash",
  "astral-tl",
@@ -2916,7 +2916,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-rb"
-version = "4.9.1"
+version = "4.9.3"
 dependencies = [
  "async-trait",
  "html-to-markdown-rs",
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
 [[package]]
 name = "liter-llm"
-version = "1.2.1"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1884be380e45da823105c85ef0fa188af81d57be7de9b65016576e1774fdd5f8"
+checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
 dependencies = [
  "base64 0.22.1",
  "bytes",
@@ -3634,9 +3634,9 @@ dependencies = [
 [[package]]
 name = "openssl"
-version = "0.10.77"
+version = "0.10.78"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfe4646e360ec77dff7dde40ed3d6c5fee52d156ef4a62f53973d38294dad87f"
+checksum = "f38c4372413cdaaf3cc79dd92d29d7d9f5ab09b51b10dded508fb90bb70b9222"
 dependencies = [
  "bitflags",
  "cfg-if",
@@ -3666,9 +3666,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 [[package]]
 name = "openssl-sys"
-version = "0.9.113"
+version = "0.9.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad2f2c0eba47118757e4c6d2bff2838f3e0523380021356e7875e858372ce644"
+checksum = "13ce1245cd07fcc4cfdb438f7507b0c7e4f3849a69fd84d52374c66d83741bb6"
 dependencies = [
  "cc",
  "libc",
@@ -4619,9 +4619,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
 [[package]]
 name = "rustls-webpki"
-version = "0.103.12"
+version = "0.103.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06"
+checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
 dependencies = [
  "aws-lc-rs",
  "ring",
@@ -5506,7 +5506,7 @@ dependencies = [
  "toml_datetime 1.1.1+spec-1.1.0",
  "toml_parser",
  "toml_writer",
- "winnow 1.0.1",
+ "winnow 1.0.2",
 ]
 [[package]]
@@ -5533,7 +5533,7 @@ version = "1.1.2+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
 dependencies = [
- "winnow 1.0.1",
+ "winnow 1.0.2",
 ]
 [[package]]
@@ -6577,9 +6577,9 @@ checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
 [[package]]
 name = "winnow"
-version = "1.0.1"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5"
+checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0"
 [[package]]
 name = "wit-bindgen"

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-rb"
-version = "4.9.1"
+version = "4.9.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
     "time",
     "io-util",
 ] }
-html-to-markdown-rs = { version = "3.2.5", default-features = false }
+html-to-markdown-rs = { version = "3.2.6", default-features = false }
 [dev-dependencies]
 pretty_assertions = "1.4"

data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED Viewed

@@ -404,6 +404,12 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
         true
     };
+    let max_images_per_page = if let Some(val) = get_kw(ruby, hash, "max_images_per_page") {
+        Some(u32::try_convert(val)?)
+    } else {
+        None
+    };
     let config = ImageExtractionConfig {
         extract_images,
         target_dpi,
@@ -412,6 +418,7 @@ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageEx
         auto_adjust_dpi,
         min_dpi,
         max_dpi,
+        max_images_per_page,
     };
     Ok(config)

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.9.1'
+  VERSION = '4.9.4'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.9.1"
+version = "4.9.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,12 +28,12 @@ dbase = "0.7"
 futures = "0.3"
 getrandom = { version = "0.4.2", features = ["wasm_js"] }
 hex = "0.4.3"
-html-to-markdown-rs = { version = "3.2.5", default-features = false }
+html-to-markdown-rs = { version = "3.2.6", default-features = false }
 image = { version = "0.25.10", default-features = false }
 itertools = "0.14"
 js-sys = "0.3"
-kreuzberg = { path = "./crates/kreuzberg", version = "4.9.1", default-features = false }
-kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.1" }
+kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
+kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
 lazy_static = "1.5.0"
 libc = "0.2.185"
 liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
 once_cell = "1.21.4"
 ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
 parking_lot = "0.12.5"
-pdf_oxide = { version = "0.3.34", default-features = false }
+pdf_oxide = { version = "0.3.37", default-features = false }
 pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
 rayon = "1.12.0"
 reqwest = { version = "0.13.2", default-features = false }

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.9.1"
+version = "4.9.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
     "simd",
 ], optional = true }
 hex = "0.4.3"
-html-to-markdown-rs = { version = "3.2.5", default-features = false, features = [
+html-to-markdown-rs = { version = "3.2.6", default-features = false, features = [
     "inline-images",
     "metadata",
 ], optional = true }
@@ -287,7 +287,7 @@ image = { version = "0.25.10", default-features = false, features = [
 ], optional = true }
 indexmap = "2.14.0"
 infer = "0.19.0"
-jotdown = "0.9"
+jotdown = "0.10"
 kamadak-exif = { version = "0.6.1", optional = true }
 kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
 outlook-pst = { version = "1.2.0", optional = true }
 parking_lot = "0.12.5"
 pastey = "0.2"
-pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
+pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
 pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
 pulldown-cmark = { version = "0.13" }
 quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.9.1 Release**
+> **🚀 Version 4.9.4 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/chunking/semantic/mod.rs CHANGED Viewed

@@ -25,10 +25,6 @@ const SEGMENT_SIZE: usize = 200;
 #[cfg(feature = "embeddings")]
 const DEFAULT_TOPIC_THRESHOLD: f32 = 0.75;
-/// Safety ceiling for auto-budget when no embedding model is configured.
-/// Prevents unbounded chunks in header-less documents.
-const AUTO_BUDGET_CEILING: usize = 4000;
 /// Split text into semantically coherent chunks.
 ///
 /// Splits text into fine-grained segments, detects structural (and optionally
@@ -46,6 +42,8 @@ pub fn chunk_semantic(
         });
     }
+    warn_if_fallback_path(config);
     let seg_size = SEGMENT_SIZE;
     let has_markdown_headers = text.lines().any(crate::utils::markdown_utils::is_markdown_header);
     let splitter_segments: Vec<&str> = if has_markdown_headers {
@@ -165,11 +163,33 @@ fn compute_boundaries(_segments: &[Segment<'_>], forced: &[bool], _config: &Chun
     Ok(forced.to_vec())
 }
-/// Resolve the safety ceiling for chunk size.
+/// Warn when the semantic chunker is invoked without an embedding model.
+///
+/// Without an embedding, `chunk_semantic` falls back to a structural-boundary
+/// heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs).
+/// Topic-similarity chunking requires an embedding model. This warning makes
+/// the fallback mode discoverable to callers who think they're getting
+/// embedding-driven topic detection.
+#[cfg(feature = "embeddings")]
+fn warn_if_fallback_path(config: &ChunkingConfig) {
+    if config.embedding.is_none() {
+        tracing::warn!(
+            "chunker_type='semantic' without an EmbeddingConfig falls back to a \
+             structural-boundary heuristic; topic-similarity chunking requires an \
+             embedding model. Either configure `embedding` or switch to \
+             chunker_type='text'/'markdown' to silence this warning."
+        );
+    }
+}
+#[cfg(not(feature = "embeddings"))]
+fn warn_if_fallback_path(_config: &ChunkingConfig) {}
+/// Resolve the size ceiling for merged chunks.
 ///
-/// When an embedding preset is configured, use its chunk_size as the ceiling
-/// (chunks must fit in the model's context window). Otherwise use a generous
-/// default that prevents unbounded chunks in header-less documents.
+/// When an embedding preset is configured, use its `chunk_size` so chunks fit
+/// in the model's context window. Otherwise honor the caller's configured
+/// `max_characters`.
 fn resolve_ceiling(config: &ChunkingConfig) -> usize {
     #[cfg(feature = "embeddings")]
     if let Some(ref emb) = config.embedding
@@ -178,8 +198,7 @@ fn resolve_ceiling(config: &ChunkingConfig) -> usize {
     {
         return size;
     }
-    let _ = config;
-    AUTO_BUDGET_CEILING
+    config.max_characters
 }
 #[cfg(test)]
@@ -306,30 +325,124 @@ mod tests {
     }
     #[test]
-    fn ceiling_caps_oversized_headerless_text() {
-        // A large block of text with no headers should be split at the ceiling,
-        // not produce one unbounded chunk.
-        let text = "word ".repeat(1500); // ~7500 chars, exceeds AUTO_BUDGET_CEILING
+    fn max_characters_caps_oversized_headerless_text() {
+        // A large block of text with no headers must be split so every chunk
+        // respects the caller's configured max_characters.
+        let text = "word ".repeat(1500); // ~7500 chars
+        let max = 1000;
         let config = ChunkingConfig {
-            max_characters: 1000, // ignored by semantic chunker
+            max_characters: max,
             overlap: 0,
             trim: true,
             chunker_type: ChunkerType::Semantic,
             ..Default::default()
         };
         let result = chunk_semantic(&text, &config, None).unwrap();
-        assert!(result.chunks.len() >= 2, "should split at ceiling, got 1 chunk");
+        assert!(result.chunks.len() >= 2, "should split at max_characters, got 1 chunk");
         for (i, chunk) in result.chunks.iter().enumerate() {
             assert!(
-                chunk.content.chars().count() <= super::AUTO_BUDGET_CEILING + 100,
-                "chunk {} exceeds ceiling: {} > {}",
+                chunk.content.chars().count() <= max,
+                "chunk {} exceeds max_characters: {} > {}",
                 i,
                 chunk.content.chars().count(),
-                super::AUTO_BUDGET_CEILING
+                max
             );
         }
     }
+    #[test]
+    fn max_characters_controls_fallback_chunk_size() {
+        // bb-yq35 repro: with no embedding configured, different max_characters
+        // values must produce different chunking output.
+        let sample = format!(
+            "{}{}{}",
+            "Solar panel efficiency improves. ".repeat(200),
+            "\n\nFDA clinical trials require double-blind. ".repeat(200),
+            "\n\nQuantum entanglement needs cooling. ".repeat(200),
+        );
+        let run = |max: usize| {
+            let config = ChunkingConfig {
+                max_characters: max,
+                overlap: 0,
+                trim: true,
+                chunker_type: ChunkerType::Semantic,
+                ..Default::default()
+            };
+            chunk_semantic(&sample, &config, None).unwrap()
+        };
+        let small = run(500);
+        let large = run(1500);
+        assert!(
+            small.chunks.len() > large.chunks.len(),
+            "smaller max_characters must yield more chunks: small={}, large={}",
+            small.chunks.len(),
+            large.chunks.len()
+        );
+        for chunk in &small.chunks {
+            assert!(
+                chunk.content.chars().count() <= 500,
+                "small chunk exceeds cap: {}",
+                chunk.content.chars().count()
+            );
+        }
+        for chunk in &large.chunks {
+            assert!(
+                chunk.content.chars().count() <= 1500,
+                "large chunk exceeds cap: {}",
+                chunk.content.chars().count()
+            );
+        }
+    }
+    #[cfg(feature = "embeddings")]
+    #[test]
+    fn semantic_without_embedding_warns() {
+        use std::io::Write;
+        use std::sync::{Arc, Mutex};
+        #[derive(Clone, Default)]
+        struct Buf(Arc<Mutex<Vec<u8>>>);
+        impl Write for Buf {
+            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+                self.0.lock().unwrap().extend_from_slice(buf);
+                Ok(buf.len())
+            }
+            fn flush(&mut self) -> std::io::Result<()> {
+                Ok(())
+            }
+        }
+        impl<'a> tracing_subscriber::fmt::MakeWriter<'a> for Buf {
+            type Writer = Buf;
+            fn make_writer(&'a self) -> Self::Writer {
+                self.clone()
+            }
+        }
+        let buffer = Buf::default();
+        let subscriber = tracing_subscriber::fmt()
+            .with_writer(buffer.clone())
+            .with_max_level(tracing::Level::WARN)
+            .with_ansi(false)
+            .finish();
+        tracing::subscriber::with_default(subscriber, || {
+            let config = ChunkingConfig {
+                chunker_type: ChunkerType::Semantic,
+                ..Default::default()
+            };
+            let _ = chunk_semantic("hello world", &config, None).unwrap();
+        });
+        let captured = String::from_utf8(buffer.0.lock().unwrap().clone()).unwrap();
+        assert!(
+            captured.contains("without an EmbeddingConfig"),
+            "expected fallback warning in captured logs, got: {captured:?}"
+        );
+    }
     #[test]
     fn sections_with_headers_produce_separate_chunks() {
         // Each section has enough content that the segments span multiple paragraphs.

data/vendor/kreuzberg/src/core/config/extraction/types.rs CHANGED Viewed

@@ -40,6 +40,18 @@ pub struct ImageExtractionConfig {
     /// Maximum DPI threshold
     #[serde(default = "default_max_dpi")]
     pub max_dpi: i32,
+    /// Maximum number of image objects to extract per PDF page.
+    ///
+    /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
+    /// can trigger extremely long or indefinite extraction times when every image
+    /// object on a dense page is decoded individually via pdfium FFI. Setting this
+    /// limit causes kreuzberg to stop collecting individual images once the count
+    /// per page reaches the cap and emit a warning instead.
+    ///
+    /// `None` (default) means no limit — all images are extracted.
+    #[serde(default)]
+    pub max_images_per_page: Option<u32>,
 }
 /// Token reduction configuration.
@@ -98,3 +110,44 @@ fn default_reduction_mode() -> String {
 fn default_confidence() -> f64 {
     0.8
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_max_images_per_page_defaults_none() {
+        let config = ImageExtractionConfig::default();
+        assert_eq!(config.max_images_per_page, None);
+    }
+    #[test]
+    fn test_max_images_per_page_serializes_as_null_when_none() {
+        let config = ImageExtractionConfig::default();
+        let json = serde_json::to_string(&config).unwrap();
+        assert!(json.contains("\"max_images_per_page\":null"));
+    }
+    #[test]
+    fn test_max_images_per_page_roundtrips_via_json() {
+        let config = ImageExtractionConfig {
+            max_images_per_page: Some(50),
+            ..Default::default()
+        };
+        let json = serde_json::to_string(&config).unwrap();
+        let back: ImageExtractionConfig = serde_json::from_str(&json).unwrap();
+        assert_eq!(back.max_images_per_page, Some(50));
+    }
+    /// Regression test for issue #766: missing field in JSON must not break
+    /// deserialization (backwards-compat — existing configs without this key
+    /// must still deserialize cleanly).
+    #[test]
+    fn test_max_images_per_page_absent_in_json_deserializes_as_none() {
+        let json = r#"{"extract_images":true,"target_dpi":300,"max_image_dimension":4096,
+                       "inject_placeholders":true,"auto_adjust_dpi":true,
+                       "min_dpi":72,"max_dpi":600}"#;
+        let config: ImageExtractionConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.max_images_per_page, None);
+    }
+}

data/vendor/kreuzberg/src/core/config/ocr.rs CHANGED Viewed

@@ -323,9 +323,12 @@ impl OcrConfig {
     /// Returns the effective pipeline config.
     ///
     /// - If `pipeline` is explicitly set, returns it.
-    /// - If `paddle-ocr` feature is compiled in and no explicit pipeline is set,
-    ///   auto-constructs a default pipeline: primary backend (priority 100) + paddleocr (priority 50).
-    /// - Otherwise returns `None` (single-backend mode, same as today).
+    /// - If `paddle-ocr` is compiled in and the backend is the default
+    ///   (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
+    /// - Otherwise returns `None` (single-backend mode).
+    ///
+    /// Explicit non-default backend selections are honored as-is — a silent
+    /// paddleocr fallback would mask errors from the chosen backend.
     pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
         if self.pipeline.is_some() {
             return self.pipeline.clone();
@@ -333,25 +336,28 @@ impl OcrConfig {
         #[cfg(feature = "paddle-ocr")]
         {
-            let mut stages = vec![OcrPipelineStage {
-                backend: self.backend.clone(),
-                priority: 100,
-                language: None,
-                tesseract_config: self.tesseract_config.clone(),
-                paddle_ocr_config: None,
-                vlm_config: self.vlm_config.clone(),
-            }];
-            // Only add paddleocr fallback if primary backend isn't already paddleocr
-            if self.backend != "paddleocr" {
-                stages.push(OcrPipelineStage {
+            if self.backend != default_tesseract_backend() {
+                return None;
+            }
+            let stages = vec![
+                OcrPipelineStage {
+                    backend: self.backend.clone(),
+                    priority: 100,
+                    language: None,
+                    tesseract_config: self.tesseract_config.clone(),
+                    paddle_ocr_config: None,
+                    vlm_config: self.vlm_config.clone(),
+                },
+                OcrPipelineStage {
                     backend: "paddleocr".to_string(),
                     priority: 50,
                     language: None,
                     tesseract_config: None,
                     paddle_ocr_config: self.paddle_ocr_config.clone(),
                     vlm_config: None,
-                });
-            }
+                },
+            ];
             Some(OcrPipelineConfig {
                 stages,
                 quality_thresholds: self.effective_thresholds(),
@@ -485,29 +491,21 @@ mod tests {
     }
     #[test]
-    fn test_effective_pipeline_paddleocr_backend_no_duplicate() {
-        // When primary backend is "paddleocr", effective_pipeline should NOT add
-        // a second paddleocr stage (issue #6 fix).
+    fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
         let config = OcrConfig {
             backend: "paddleocr".to_string(),
             ..Default::default()
         };
-        let result = config.effective_pipeline();
-        // With paddle-ocr feature: should have exactly 1 stage (no duplicate)
-        // Without paddle-ocr feature: should be None
-        #[cfg(feature = "paddle-ocr")]
-        {
-            let pipeline = result.unwrap();
-            let paddle_count = pipeline.stages.iter().filter(|s| s.backend == "paddleocr").count();
-            assert_eq!(
-                paddle_count, 1,
-                "Should not have duplicate paddleocr stages, found {paddle_count}"
-            );
-        }
-        #[cfg(not(feature = "paddle-ocr"))]
-        {
-            assert!(result.is_none());
-        }
+        assert!(config.effective_pipeline().is_none());
+    }
+    #[test]
+    fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
+        let config = OcrConfig {
+            backend: "easyocr".to_string(),
+            ..Default::default()
+        };
+        assert!(config.effective_pipeline().is_none());
     }
     #[test]

data/vendor/kreuzberg/src/core/config/processing.rs CHANGED Viewed

@@ -14,11 +14,13 @@ use std::path::PathBuf;
 /// * `Text` - Generic text splitter, splits on whitespace and punctuation
 /// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
 /// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
-/// * `Semantic` - Topic-aware chunker that splits at natural document boundaries
-///   (headers, paragraph breaks, topic shifts). Works out of the box with no extra
-///   configuration. Optionally add an `EmbeddingConfig` for embedding-based topic
-///   detection; `topic_threshold` (default 0.75) and `max_characters` (default 1000)
-///   are automatically applied when not specified.
+/// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
+///   embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
+///   lower = more splits). Without an embedding, falls back to a
+///   structural-boundary heuristic (ALL-CAPS headers, numbered sections,
+///   blank-line paragraphs) and merges groups into chunks capped at
+///   `max_characters` (default 1000). `topic_threshold` has no effect in the
+///   fallback path. For best results, pair with an embedding model.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
 #[serde(rename_all = "lowercase")]
 pub enum ChunkerType {