kreuzberg 4.9.8 → 4.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  4. data/lib/kreuzberg/version.rb +1 -1
  5. data/vendor/Cargo.toml +7 -8
  6. data/vendor/kreuzberg/Cargo.toml +22 -21
  7. data/vendor/kreuzberg/README.md +1 -1
  8. data/vendor/kreuzberg/src/core/config/pdf.rs +2 -5
  9. data/vendor/kreuzberg/src/core/extractor/bytes.rs +6 -1
  10. data/vendor/kreuzberg/src/core/extractor/file.rs +6 -1
  11. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -0
  12. data/vendor/kreuzberg/src/core/pipeline/features.rs +115 -15
  13. data/vendor/kreuzberg/src/embeddings/mod.rs +17 -13
  14. data/vendor/kreuzberg/src/extraction/email.rs +58 -7
  15. data/vendor/kreuzberg/src/extraction/image_ocr.rs +72 -0
  16. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +0 -168
  17. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +1 -410
  18. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +41 -15
  19. data/vendor/kreuzberg/src/pdf/images.rs +22 -4
  20. data/vendor/kreuzberg/src/pdf/mod.rs +0 -16
  21. data/vendor/kreuzberg/src/pdf/rendering.rs +53 -6
  22. data/vendor/kreuzberg/src/pdf/structure/mod.rs +0 -2
  23. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +12 -890
  24. data/vendor/kreuzberg/src/table_core.rs +8 -1
  25. data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +26 -0
  26. data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +1 -2
  27. data/vendor/kreuzberg-ffi/Cargo.toml +5 -5
  28. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  29. data/vendor/kreuzberg-ffi/src/config/loader.rs +39 -24
  30. data/vendor/kreuzberg-ffi/src/config/mod.rs +0 -4
  31. data/vendor/kreuzberg-ffi/src/lib.rs +0 -1
  32. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
  33. data/vendor/kreuzberg-paddle-ocr/src/ocr_utils.rs +3 -3
  34. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  35. data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
  36. metadata +2 -10
  37. data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +0 -258
  38. data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +0 -235
  39. data/vendor/kreuzberg/src/pdf/oxide/images.rs +0 -53
  40. data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +0 -381
  41. data/vendor/kreuzberg/src/pdf/oxide/mod.rs +0 -43
  42. data/vendor/kreuzberg/src/pdf/oxide/table.rs +0 -247
  43. data/vendor/kreuzberg/src/pdf/oxide/text.rs +0 -250
  44. data/vendor/kreuzberg/src/pdf/oxide_text.rs +0 -122
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b634075425816167cdb132080aa31a6c5c561badbd154de17896866bdf88ba6e
4
- data.tar.gz: 4a8d041bad2da842a676b2e2358ba44d3269323e820728bdb568ebc9171adee1
3
+ metadata.gz: de92334e109bbca1bdd22469a651f146bf29eee730dfc841c3dcb4703ee3ba5b
4
+ data.tar.gz: 53140e24511ff0910814325859b3e7382ee9f511e2412181812607f0f3516f33
5
5
  SHA512:
6
- metadata.gz: 60c018a882054c23b629ee1d0692493c25c78a52e106fc7f25e3c22d7cdcfd36341f5aaeebea3d6be48d1425550914124daaeaeeb02bbd84bbb64cd6afedad59
7
- data.tar.gz: 62d942f23dae32120184cd5b011275a2556b3e112a866128bd290a67ddbbd94c523ec2eaecd24589911264609ae612fd22eb51f3efc2e9eec4b1d08f8bd6823c
6
+ metadata.gz: 80b7a6fa716b1adf28d543074581d5f88984aae738669e52116c49d301b1116ed7311c4bcfa90d1853a9d98752b0d2ad13b5c0e14e9f2623217ff319c007eca2
7
+ data.tar.gz: fab95d4048b382cf3ff4e154d7979bc4963e35e57c802cd5f871950d460ef16c5745c66931fbbf15b02a479f0a991a9e1832a126d445c002c7bf79b92ae143b9
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.8" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.9" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.8"
3
+ version = "4.9.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -52,7 +52,7 @@ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb51742
52
52
  rb-sys = { version = "0.9.128", default-features = false, features = [
53
53
  "stable-api-compiled-fallback",
54
54
  ] }
55
- serde_json = "1.0.149"
55
+ serde_json = "1.0.150"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
58
  tokio = { version = "1.52.3", features = [
@@ -65,7 +65,7 @@ tokio = { version = "1.52.3", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.4.1", default-features = false }
68
+ html-to-markdown-rs = { version = "3.5.7", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.8'
4
+ VERSION = '4.9.9'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.8"
5
+ version = "4.9.9"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,29 +28,28 @@ dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.4.1", default-features = false }
31
+ html-to-markdown-rs = { version = "3.5.7", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.8", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.8" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.9", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.9" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.186"
39
39
  liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
40
40
  log = "0.4"
41
- lzma-rust2 = { version = "0.16.2" }
41
+ lzma-rust2 = { version = "0.16.4" }
42
42
  memmap2 = "0.9"
43
43
  minijinja = "2"
44
44
  num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.49", default-features = false }
49
48
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
49
  rayon = "1.12.0"
51
- reqwest = { version = "0.13.3", default-features = false }
50
+ reqwest = { version = "0.13.4", default-features = false }
52
51
  serde = { version = "1.0.228", features = ["derive"] }
53
- serde_json = { version = "1.0.149" }
52
+ serde_json = { version = "1.0.150" }
54
53
  serde_toon_format = "0.1"
55
54
  tempfile = "3.27.0"
56
55
  thiserror = "2.0.18"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.8"
3
+ version = "4.9.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -39,10 +39,6 @@ simd-utf8 = ["dep:simdutf8"]
39
39
  tokio-runtime = ["dep:tokio"]
40
40
 
41
41
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image", "dep:flate2", "html"]
42
- # Experimental: use pdf_oxide for text extraction (pure Rust, no C++ deps).
43
- # Provides cleaner word spacing for PDFs with broken font CMaps.
44
- # Requires 'pdf' feature. Not included in 'full' — opt-in only.
45
- pdf-oxide = ["pdf", "dep:pdf_oxide"]
46
42
  static-pdfium = ["pdf"]
47
43
  bundled-pdfium = ["pdf"]
48
44
  system-pdfium = ["pdf"]
@@ -61,7 +57,14 @@ office = [
61
57
  ]
62
58
  hwp = ["dep:cfb", "dep:flate2"]
63
59
  iwork = ["dep:zip", "dep:snap"]
64
- email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
60
+ email = [
61
+ "dep:mail-parser",
62
+ "dep:cfb",
63
+ "dep:outlook-pst",
64
+ "dep:tempfile",
65
+ "dep:chrono",
66
+ "dep:chardetng",
67
+ ]
65
68
  html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
66
69
  xml = ["dep:quick-xml", "dep:roxmltree"]
67
70
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
@@ -259,7 +262,7 @@ cfb = { version = "0.14", optional = true }
259
262
  chardetng = { version = "1.0.0", optional = true }
260
263
  chrono = { version = "0.4", optional = true }
261
264
  comrak = { version = "0.52", default-features = false }
262
- dashmap = "6.1"
265
+ dashmap = "6.2"
263
266
  dbase = { version = "0.7", optional = true }
264
267
  dirs = "6"
265
268
  encoding_rs = { version = "0.8.35" }
@@ -271,7 +274,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
274
  "simd",
272
275
  ], optional = true }
273
276
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.4.1", default-features = false, features = [
277
+ html-to-markdown-rs = { version = "3.5.7", default-features = false, features = [
275
278
  "inline-images",
276
279
  "metadata",
277
280
  ], optional = true }
@@ -294,9 +297,9 @@ kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
294
297
  libc = "0.2.186"
295
298
  liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
296
299
  log = "0.4"
297
- lopdf = { version = "0.40.0", optional = true }
300
+ lopdf = { version = "0.41.0", optional = true }
298
301
  mail-parser = { version = "0.11.3", optional = true }
299
- memchr = "2.8.0"
302
+ memchr = "2.8.1"
300
303
  memmap2 = "0.9"
301
304
  mime_guess = "2.0"
302
305
  minijinja = { version = "2", optional = true }
@@ -314,7 +317,6 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
317
  outlook-pst = { version = "1.2.0", optional = true }
315
318
  parking_lot = "0.12.5"
316
319
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.49", default-features = false, optional = true }
318
320
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
321
  pulldown-cmark = { version = "0.13" }
320
322
  quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
@@ -333,22 +335,21 @@ rmp-serde = "1.3"
333
335
 
334
336
  roxmltree = { version = "0.21.1", optional = true }
335
337
  serde = { version = "1.0.228", features = ["derive"] }
336
- serde_json = { version = "1.0.149" }
338
+ serde_json = { version = "1.0.150" }
337
339
  serde_toon_format = "0.1"
338
340
  serde_yaml_ng = "0.10.0"
339
341
  sevenz-rust2 = { version = "0.20.2", optional = true }
340
342
  sha2 = { version = "0.11", optional = true }
341
343
  simdutf8 = { version = "0.1", optional = true }
342
344
  snap = { version = "1.1", optional = true }
343
- tar = { version = "0.4.45", optional = true }
345
+ tar = { version = "0.4.46", optional = true }
344
346
  tempfile = { version = "3.27.0", optional = true }
345
- text-splitter = { version = "0.30.1", features = ["markdown"], optional = true }
347
+ text-splitter = { version = "0.31.0", features = ["markdown"], optional = true }
346
348
  thiserror = "2.0.18"
347
349
  tiff = { version = "0.11", optional = true }
348
- # Pinned to 0.22 text-splitter 0.30.1 ChunkSizer impl + embeddings/add_special_tokens
349
- # break against tokenizers 0.23. Bump deliberately on the next minor with a coordinated
350
- # text-splitter upgrade. Tracked under issue #991 / 4.9.8 release.
351
- tokenizers = { version = "=0.22.2", optional = true, default-features = false, features = [
350
+ # Keep aligned with text-splitter's optional tokenizers integration so ChunkSizer
351
+ # is implemented for the same Tokenizer type used by Kreuzberg.
352
+ tokenizers = { version = "0.23.1", optional = true, default-features = false, features = [
352
353
  "http",
353
354
  "fancy-regex",
354
355
  ] }
@@ -357,7 +358,7 @@ toml = "1.1.2"
357
358
  tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
358
359
  tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
359
360
  tracing = "0.1"
360
- tracing-opentelemetry = { version = "0.32", optional = true }
361
+ tracing-opentelemetry = { version = "0.33", optional = true }
361
362
  unicode-normalization = { version = "0.1.25", optional = true }
362
363
  urlencoding = "2"
363
364
  utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
@@ -411,8 +412,8 @@ dotenvy = "0.15"
411
412
  filetime = "0.2"
412
413
  image = { version = "0.25.10", default-features = false, features = ["png"] }
413
414
  jsonschema = "0.46"
414
- serial_test = "3.4.0"
415
- tar = "0.4.45"
415
+ serial_test = "3.5.0"
416
+ tar = "0.4.46"
416
417
  tempfile = "3.27.0"
417
418
  tokio = { version = "1.52.3", features = ["macros", "time"] }
418
419
  tokio-test = "0.4"
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.8 Release**
21
+ > **🚀 Version 4.9.9 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -8,17 +8,14 @@ use serde::{Deserialize, Serialize};
8
8
  /// PDF extraction backend selection.
9
9
  ///
10
10
  /// Controls which PDF library is used for text extraction:
11
- /// - `Pdfium`: pdfium-render (default, C++ based, mature)
12
- /// - `PdfOxide`: pdf_oxide (pure Rust, faster, requires `pdf-oxide` feature)
13
- /// - `Auto`: automatically select based on available features
11
+ /// - `Pdfium`: pdfium-render (default, mature)
12
+ /// - `Auto`: automatically select the default available backend
14
13
  #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
15
14
  #[serde(rename_all = "lowercase")]
16
15
  pub enum PdfBackend {
17
16
  /// Use pdfium-render backend (default).
18
17
  #[default]
19
18
  Pdfium,
20
- /// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
21
- PdfOxide,
22
19
  /// Automatically select the best available backend.
23
20
  Auto,
24
21
  }
@@ -128,7 +128,12 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
128
128
 
129
129
  #[cfg(not(feature = "tokio-runtime"))]
130
130
  let result = {
131
- let _ = config.extraction_timeout_secs;
131
+ if config.extraction_timeout_secs.is_some() {
132
+ return Err(crate::KreuzbergError::Validation {
133
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
134
+ source: None,
135
+ });
136
+ }
132
137
  extraction_future.await
133
138
  };
134
139
 
@@ -142,7 +142,12 @@ pub async fn extract_file(
142
142
 
143
143
  #[cfg(not(feature = "tokio-runtime"))]
144
144
  let result = {
145
- let _ = config.extraction_timeout_secs;
145
+ if config.extraction_timeout_secs.is_some() {
146
+ return Err(crate::KreuzbergError::Validation {
147
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
148
+ source: None,
149
+ });
150
+ }
146
151
  extraction_future.await
147
152
  };
148
153
 
@@ -35,6 +35,13 @@ pub(super) fn extract_bytes_sync_impl(
35
35
  let cfg = config.cloned().unwrap_or_default();
36
36
  let cfg = cfg.normalized().into_owned();
37
37
 
38
+ if cfg.extraction_timeout_secs.is_some() {
39
+ return Err(crate::KreuzbergError::Validation {
40
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
41
+ source: None,
42
+ });
43
+ }
44
+
38
45
  let validated_mime = if let Some(mime) = mime_type {
39
46
  if mime == "application/octet-stream" {
40
47
  mime::detect_mime_type_from_bytes(content)?
@@ -37,10 +37,19 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
37
37
  continue;
38
38
  }
39
39
 
40
- // Try exact match first
41
- if let Some(pos) = content[search_offset..].find(&page.content) {
40
+ let normalized: String = page
41
+ .content
42
+ .split("\n\n")
43
+ .map(str::trim)
44
+ .filter(|s| !s.is_empty())
45
+ .collect::<Vec<_>>()
46
+ .join("\n\n");
47
+
48
+ // Try normalized exact match first. PDF page text can contain trailing
49
+ // spaces that render_plain strips before chunking.
50
+ if let Some(pos) = content[search_offset..].find(normalized.as_str()) {
42
51
  let byte_start = search_offset + pos;
43
- let byte_end = content.floor_char_boundary(byte_start + page.content.len());
52
+ let byte_end = content.floor_char_boundary(byte_start + normalized.len());
44
53
  boundaries.push(PageBoundary {
45
54
  page_number: page.page_number,
46
55
  byte_start,
@@ -50,12 +59,12 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
50
59
  continue;
51
60
  }
52
61
 
53
- // Fallback: search for first non-empty line of page content
62
+ // Fallback: search for first non-empty line of page content.
54
63
  if let Some(line) = page.content.lines().find(|l| !l.trim().is_empty()).map(|l| l.trim())
55
64
  && let Some(pos) = content[search_offset..].find(line)
56
65
  {
57
66
  let byte_start = search_offset + pos;
58
- let raw_end = (byte_start + page.content.len()).min(content.len());
67
+ let raw_end = (byte_start + normalized.len()).min(content.len());
59
68
  let byte_end = content.floor_char_boundary(raw_end);
60
69
  boundaries.push(PageBoundary {
61
70
  page_number: page.page_number,
@@ -176,25 +185,27 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
176
185
  let resolved_config = chunking_config.resolve_preset();
177
186
  let chunking_config = &resolved_config;
178
187
 
179
- // Recompute page boundaries against `result.content` (rendered by `render_plain`)
180
- // if per-page content is available. The boundaries stored in
181
- // `result.metadata.pages.boundaries` were computed against the raw extractor text
182
- // and may have different byte offsets than the rendered content (fix for #636).
188
+ let (chunk_input, heading_source) = if config.output_format != crate::core::config::OutputFormat::Plain {
189
+ (
190
+ result.formatted_content.as_deref().unwrap_or(result.content.as_str()),
191
+ None,
192
+ )
193
+ } else {
194
+ (result.content.as_str(), result.formatted_content.as_deref())
195
+ };
196
+
183
197
  let recomputed_boundaries: Option<Vec<PageBoundary>> = result
184
198
  .pages
185
199
  .as_deref()
186
- .map(|pages| recompute_boundaries_from_pages(&result.content, pages));
200
+ .map(|pages| recompute_boundaries_from_pages(chunk_input, pages))
201
+ .filter(|boundaries| !boundaries.is_empty());
187
202
 
188
203
  let page_boundaries: Option<&[PageBoundary]> = recomputed_boundaries
189
204
  .as_deref()
190
205
  .or_else(|| result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref()));
191
206
 
192
- // Pass formatted_content (markdown) for heading context resolution when available.
193
- // Plain-text rendering strips heading markers, but the markdown chunker needs them
194
- // to build the heading hierarchy for chunk metadata.
195
- let heading_source = result.formatted_content.as_deref();
196
207
  match crate::chunking::chunk_text_with_heading_source(
197
- &result.content,
208
+ chunk_input,
198
209
  chunking_config,
199
210
  page_boundaries,
200
211
  heading_source,
@@ -314,3 +325,92 @@ pub(super) fn execute_token_reduction(result: &mut ExtractionResult, config: &Ex
314
325
 
315
326
  Ok(())
316
327
  }
328
+
329
+ #[cfg(test)]
330
+ #[cfg(feature = "chunking")]
331
+ mod tests {
332
+ use super::*;
333
+ use crate::core::config::{ChunkerType, ChunkingConfig, OutputFormat};
334
+ use crate::types::PageContent;
335
+
336
+ fn make_page(page_number: usize, content: &str) -> PageContent {
337
+ PageContent {
338
+ page_number,
339
+ content: content.to_string(),
340
+ tables: Vec::new(),
341
+ images: Vec::new(),
342
+ hierarchy: None,
343
+ is_blank: None,
344
+ layout_regions: None,
345
+ }
346
+ }
347
+
348
+ fn markdown_chunking_config() -> ExtractionConfig {
349
+ ExtractionConfig {
350
+ output_format: OutputFormat::Markdown,
351
+ chunking: Some(ChunkingConfig {
352
+ max_characters: 2000,
353
+ overlap: 0,
354
+ trim: true,
355
+ chunker_type: ChunkerType::Markdown,
356
+ ..Default::default()
357
+ }),
358
+ ..Default::default()
359
+ }
360
+ }
361
+
362
+ #[test]
363
+ fn chunks_content_is_markdown_when_output_format_is_markdown() {
364
+ let mut result = ExtractionResult {
365
+ content: "SH-001 Luca Bianchi Common Germany 3500000".to_string(),
366
+ formatted_content: Some("| SH-001 | Luca Bianchi | Common | Germany | 3,500,000 |".to_string()),
367
+ mime_type: Cow::Borrowed("application/pdf"),
368
+ ..Default::default()
369
+ };
370
+
371
+ execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
372
+
373
+ let chunks = result.chunks.expect("chunks must be populated");
374
+ assert!(!chunks.is_empty());
375
+ assert!(chunks.iter().any(|chunk| chunk.content.contains('|')));
376
+ assert!(chunks.iter().all(|chunk| !chunk.content.starts_with("SH-001 Luca")));
377
+ assert!(result.formatted_content.is_some());
378
+ }
379
+
380
+ #[test]
381
+ fn markdown_chunks_preserve_page_metadata_when_formatted_pages_match() {
382
+ let mut result = ExtractionResult {
383
+ content: "Page one text\n\nPage two text".to_string(),
384
+ formatted_content: Some("# Page one\n\nPage one text\n\n# Page two\n\nPage two text".to_string()),
385
+ pages: Some(vec![make_page(1, "Page one text"), make_page(2, "Page two text")]),
386
+ mime_type: Cow::Borrowed("application/pdf"),
387
+ ..Default::default()
388
+ };
389
+
390
+ execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
391
+
392
+ let chunks = result.chunks.expect("chunks must be populated");
393
+ assert!(!chunks.is_empty());
394
+ assert!(chunks.iter().any(|chunk| chunk.metadata.first_page.is_some()));
395
+ assert!(chunks.iter().any(|chunk| chunk.metadata.last_page.is_some()));
396
+ }
397
+
398
+ #[test]
399
+ fn recompute_boundaries_trailing_space_pages_all_resolve() {
400
+ let p1_raw = "Heading \n\nBody paragraph one. ";
401
+ let p2_raw = "Second heading \n\nBody paragraph two. ";
402
+ let p3_raw = "Conclusion. ";
403
+ let p1_norm = "Heading\n\nBody paragraph one.";
404
+ let p2_norm = "Second heading\n\nBody paragraph two.";
405
+ let p3_norm = "Conclusion.";
406
+ let content = format!("{p1_norm}\n\n{p2_norm}\n\n{p3_norm}");
407
+
408
+ let pages = vec![make_page(1, p1_raw), make_page(2, p2_raw), make_page(3, p3_raw)];
409
+ let boundaries = recompute_boundaries_from_pages(&content, &pages);
410
+
411
+ assert_eq!(boundaries.len(), 3);
412
+ assert_eq!(&content[boundaries[0].byte_start..boundaries[0].byte_end], p1_norm);
413
+ assert_eq!(&content[boundaries[1].byte_start..boundaries[1].byte_end], p2_norm);
414
+ assert_eq!(&content[boundaries[2].byte_start..boundaries[2].byte_end], p3_norm);
415
+ }
416
+ }
@@ -270,11 +270,13 @@ fn load_tokenizer(
270
270
  {
271
271
  for (_, value) in &map {
272
272
  if let Some(content) = value.as_str() {
273
- tokenizer.add_special_tokens(&[AddedToken {
274
- content: content.to_string(),
275
- special: true,
276
- ..Default::default()
277
- }]);
273
+ tokenizer
274
+ .add_special_tokens([AddedToken {
275
+ content: content.to_string(),
276
+ special: true,
277
+ ..Default::default()
278
+ }])
279
+ .map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
278
280
  } else if value.is_object()
279
281
  && let (Some(content), Some(single_word), Some(lstrip), Some(rstrip), Some(normalized)) = (
280
282
  value["content"].as_str(),
@@ -284,14 +286,16 @@ fn load_tokenizer(
284
286
  value["normalized"].as_bool(),
285
287
  )
286
288
  {
287
- tokenizer.add_special_tokens(&[AddedToken {
288
- content: content.to_string(),
289
- special: true,
290
- single_word,
291
- lstrip,
292
- rstrip,
293
- normalized,
294
- }]);
289
+ tokenizer
290
+ .add_special_tokens([AddedToken {
291
+ content: content.to_string(),
292
+ special: true,
293
+ single_word,
294
+ lstrip,
295
+ rstrip,
296
+ normalized,
297
+ }])
298
+ .map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
295
299
  }
296
300
  }
297
301
  }
@@ -74,12 +74,23 @@ fn maybe_transcode_utf16(data: &[u8]) -> Option<Vec<u8>> {
74
74
  (true, 2)
75
75
  } else if data[0] == 0xFE && data[1] == 0xFF {
76
76
  (false, 2)
77
- } else if data[1] == 0x00 && data[3] == 0x00 && data[0] != 0x00 && data[2] != 0x00 {
78
- // No BOM, but looks like UTF-16 LE (e.g. "M\0I\0M\0E\0")
79
- (true, 0)
80
- } else if data[0] == 0x00 && data[2] == 0x00 && data[1] != 0x00 && data[3] != 0x00 {
81
- // No BOM, but looks like UTF-16 BE (e.g. "\0M\0I\0M\0E")
82
- (false, 0)
77
+ } else if data.len() >= 16 {
78
+ let is_le_heuristic = data[1] == 0x00 && data[3] == 0x00 && data[5] == 0x00 && data[7] == 0x00;
79
+ let is_be_heuristic = data[0] == 0x00 && data[2] == 0x00 && data[4] == 0x00 && data[6] == 0x00;
80
+
81
+ if is_le_heuristic || is_be_heuristic {
82
+ let mut detector = chardetng::EncodingDetector::new(chardetng::Iso2022JpDetection::Allow);
83
+ detector.feed(data, true);
84
+ let guess = detector.guess(None, chardetng::Utf8Detection::Allow);
85
+
86
+ if guess.name() == "UTF-8" || guess.name() == "windows-1252" {
87
+ (is_le_heuristic, 0)
88
+ } else {
89
+ return None;
90
+ }
91
+ } else {
92
+ return None;
93
+ }
83
94
  } else {
84
95
  return None;
85
96
  };
@@ -553,6 +564,8 @@ Courier{\\colortbl\\red0\\green0\\blue0\r\n\\par \\pard\\plain\\f0\\fs20\\b\\i\\
553
564
  \\scaps\\outline\\shadow\\imprint\\emboss\\lang1024\\sbasedon1033\\fcharset0 {\\*\\cs10 \\additive \
554
565
  Default Paragraph Font}";
555
566
 
567
+ const MAX_RTF_DECOMPRESSED_CAPACITY: usize = 16 * 1024 * 1024;
568
+
556
569
  /// Decompress a PR_RTF_COMPRESSED stream per the MS-OXRTFCP specification.
557
570
  ///
558
571
  /// Returns `None` when the data is too short, has a bad magic number, or
@@ -585,7 +598,7 @@ fn decompress_rtf_compressed(data: &[u8]) -> Option<Vec<u8>> {
585
598
  // comp_size includes the 12 bytes after the first u32, so input length should be comp_size - 12.
586
599
  let end = (comp_size.saturating_sub(12)).min(input.len());
587
600
 
588
- let mut output = Vec::with_capacity(raw_size as usize);
601
+ let mut output = Vec::with_capacity((raw_size as usize).min(MAX_RTF_DECOMPRESSED_CAPACITY));
589
602
  let mut pos = 0usize;
590
603
 
591
604
  while pos < end {
@@ -2105,6 +2118,44 @@ mod tests {
2105
2118
  assert_eq!(headers.get("user_agent").unwrap(), "MyAgent/1.0");
2106
2119
  }
2107
2120
 
2121
+ #[test]
2122
+ fn test_maybe_transcode_utf16_short_binary_does_not_trigger_heuristic() {
2123
+ assert!(maybe_transcode_utf16(&[b'M', 0, b'I', 0]).is_none());
2124
+ }
2125
+
2126
+ #[test]
2127
+ fn test_decompress_rtf_compressed_crafted_raw_size_does_not_over_allocate() {
2128
+ let mut data = Vec::with_capacity(20);
2129
+ data.extend_from_slice(&16u32.to_le_bytes());
2130
+ data.extend_from_slice(&0xFFFF_FFFFu32.to_le_bytes());
2131
+ data.extend_from_slice(&0x75465a4cu32.to_le_bytes());
2132
+ data.extend_from_slice(&0u32.to_le_bytes());
2133
+ data.extend_from_slice(&[0x00, b'A', b'B', b'C']);
2134
+
2135
+ let out = decompress_rtf_compressed(&data).expect("crafted size should not force OOM");
2136
+ assert!(out.len() < 16, "output should stay tiny");
2137
+ }
2138
+
2139
+ #[test]
2140
+ fn test_decompress_rtf_compressed_cap_is_hint_only() {
2141
+ let payload: &[u8] = &[
2142
+ 0x00, b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', 0x00, b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P',
2143
+ 0x00, b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X',
2144
+ ];
2145
+ let comp_size = (12 + payload.len()) as u32;
2146
+ let raw_size = 1u32;
2147
+ let mut data = Vec::new();
2148
+ data.extend_from_slice(&comp_size.to_le_bytes());
2149
+ data.extend_from_slice(&raw_size.to_le_bytes());
2150
+ data.extend_from_slice(&0x75465a4cu32.to_le_bytes());
2151
+ data.extend_from_slice(&0u32.to_le_bytes());
2152
+ data.extend_from_slice(payload);
2153
+
2154
+ let out = decompress_rtf_compressed(&data).expect("should decompress");
2155
+ assert_eq!(out.len(), 24);
2156
+ assert_eq!(&out[..8], b"ABCDEFGH");
2157
+ }
2158
+
2108
2159
  #[test]
2109
2160
  fn test_decompress_rtf_compressed_too_short() {
2110
2161
  assert!(decompress_rtf_compressed(&[0u8; 10]).is_none());