kreuzberg 4.9.7 → 4.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.toml +5 -5
  4. data/lib/kreuzberg/version.rb +1 -1
  5. data/vendor/Cargo.toml +12 -13
  6. data/vendor/kreuzberg/Cargo.toml +37 -31
  7. data/vendor/kreuzberg/README.md +1 -1
  8. data/vendor/kreuzberg/src/core/config/pdf.rs +2 -5
  9. data/vendor/kreuzberg/src/core/config/tree_sitter.rs +0 -1
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +6 -1
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +6 -1
  12. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -0
  13. data/vendor/kreuzberg/src/core/pipeline/features.rs +115 -15
  14. data/vendor/kreuzberg/src/embeddings/mod.rs +17 -13
  15. data/vendor/kreuzberg/src/extraction/email.rs +58 -7
  16. data/vendor/kreuzberg/src/extraction/image_ocr.rs +72 -0
  17. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +0 -168
  18. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +1 -410
  19. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +41 -15
  20. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +45 -1
  21. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +13 -0
  22. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +62 -11
  23. data/vendor/kreuzberg/src/llm/structured.rs +22 -17
  24. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +11 -6
  25. data/vendor/kreuzberg/src/pdf/images.rs +22 -4
  26. data/vendor/kreuzberg/src/pdf/mod.rs +0 -16
  27. data/vendor/kreuzberg/src/pdf/rendering.rs +53 -6
  28. data/vendor/kreuzberg/src/pdf/structure/mod.rs +0 -2
  29. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +12 -890
  30. data/vendor/kreuzberg/src/table_core.rs +8 -1
  31. data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +26 -0
  32. data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +1 -2
  33. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +35 -1
  34. data/vendor/kreuzberg-ffi/Cargo.toml +7 -7
  35. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  36. data/vendor/kreuzberg-ffi/src/config/loader.rs +39 -24
  37. data/vendor/kreuzberg-ffi/src/config/mod.rs +0 -4
  38. data/vendor/kreuzberg-ffi/src/lib.rs +1 -2
  39. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +2 -2
  40. data/vendor/kreuzberg-paddle-ocr/src/ocr_utils.rs +3 -3
  41. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  42. data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
  43. metadata +2 -10
  44. data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +0 -258
  45. data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +0 -235
  46. data/vendor/kreuzberg/src/pdf/oxide/images.rs +0 -53
  47. data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +0 -381
  48. data/vendor/kreuzberg/src/pdf/oxide/mod.rs +0 -43
  49. data/vendor/kreuzberg/src/pdf/oxide/table.rs +0 -247
  50. data/vendor/kreuzberg/src/pdf/oxide/text.rs +0 -250
  51. data/vendor/kreuzberg/src/pdf/oxide_text.rs +0 -121
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24151d9931038d170843ec2b42b0f34b65d78a6f07418683ba71a96c7b2f4f50
4
- data.tar.gz: 701aedb58613376393f8e168dbd2ecea211063172cbfdaae7af96590d990e8c1
3
+ metadata.gz: de92334e109bbca1bdd22469a651f146bf29eee730dfc841c3dcb4703ee3ba5b
4
+ data.tar.gz: 53140e24511ff0910814325859b3e7382ee9f511e2412181812607f0f3516f33
5
5
  SHA512:
6
- metadata.gz: b8917f84b8d8f7861c2f71a41248244344da6444942267e65b7feb04123017f099d618b6d72632d710936983f8ee9771e6847447e2119363d6e123caf8cc4f60
7
- data.tar.gz: 59fb7bd5a2c079fde4221aa1d578542cdf1b5f3637b28787d735d7a87b6fc86dd0f239e0623d4284c39d1175bed1072f4291a644c435d7242db9e5a3abeb5575
6
+ metadata.gz: 80b7a6fa716b1adf28d543074581d5f88984aae738669e52116c49d301b1116ed7311c4bcfa90d1853a9d98752b0d2ad13b5c0e14e9f2623217ff319c007eca2
7
+ data.tar.gz: fab95d4048b382cf3ff4e154d7979bc4963e35e57c802cd5f871950d460ef16c5745c66931fbbf15b02a479f0a991a9e1832a126d445c002c7bf79b92ae143b9
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.7" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.9" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.7"
3
+ version = "4.9.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
49
49
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
50
50
  "rb-sys",
51
51
  ] }
52
- rb-sys = { version = "0.9.126", default-features = false, features = [
52
+ rb-sys = { version = "0.9.128", default-features = false, features = [
53
53
  "stable-api-compiled-fallback",
54
54
  ] }
55
- serde_json = "1.0.149"
55
+ serde_json = "1.0.150"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
- tokio = { version = "1.52.1", features = [
58
+ tokio = { version = "1.52.3", features = [
59
59
  "rt",
60
60
  "rt-multi-thread",
61
61
  "macros",
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.3.1", default-features = false }
68
+ html-to-markdown-rs = { version = "3.5.7", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.7'
4
+ VERSION = '4.9.9'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.7"
5
+ version = "4.9.9"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -23,41 +23,40 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
23
23
  comrak = { version = "0.52", default-features = false }
24
24
  console_error_panic_hook = "0.1"
25
25
  criterion = { version = "0.8", features = ["html_reports"] }
26
- ctor = "0.10"
26
+ ctor = "1.0"
27
27
  dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.3.1", default-features = false }
31
+ html-to-markdown-rs = { version = "3.5.7", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.7", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.7" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.9", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.9" }
37
37
  lazy_static = "1.5.0"
38
- libc = "0.2.185"
39
- liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
38
+ libc = "0.2.186"
39
+ liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
40
40
  log = "0.4"
41
- lzma-rust2 = { version = "0.16.2" }
41
+ lzma-rust2 = { version = "0.16.4" }
42
42
  memmap2 = "0.9"
43
43
  minijinja = "2"
44
44
  num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.37", default-features = false }
49
48
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
49
  rayon = "1.12.0"
51
- reqwest = { version = "0.13.2", default-features = false }
50
+ reqwest = { version = "0.13.4", default-features = false }
52
51
  serde = { version = "1.0.228", features = ["derive"] }
53
- serde_json = { version = "1.0.149" }
52
+ serde_json = { version = "1.0.150" }
54
53
  serde_toon_format = "0.1"
55
54
  tempfile = "3.27.0"
56
55
  thiserror = "2.0.18"
57
- tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
56
+ tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
57
  toml = "1.1.2"
59
58
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
59
+ tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false }
61
60
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
61
  wasm-bindgen-futures = "0.4"
63
62
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.7"
3
+ version = "4.9.9"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -39,10 +39,6 @@ simd-utf8 = ["dep:simdutf8"]
39
39
  tokio-runtime = ["dep:tokio"]
40
40
 
41
41
  pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image", "dep:flate2", "html"]
42
- # Experimental: use pdf_oxide for text extraction (pure Rust, no C++ deps).
43
- # Provides cleaner word spacing for PDFs with broken font CMaps.
44
- # Requires 'pdf' feature. Not included in 'full' — opt-in only.
45
- pdf-oxide = ["pdf", "dep:pdf_oxide"]
46
42
  static-pdfium = ["pdf"]
47
43
  bundled-pdfium = ["pdf"]
48
44
  system-pdfium = ["pdf"]
@@ -61,7 +57,14 @@ office = [
61
57
  ]
62
58
  hwp = ["dep:cfb", "dep:flate2"]
63
59
  iwork = ["dep:zip", "dep:snap"]
64
- email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
60
+ email = [
61
+ "dep:mail-parser",
62
+ "dep:cfb",
63
+ "dep:outlook-pst",
64
+ "dep:tempfile",
65
+ "dep:chrono",
66
+ "dep:chardetng",
67
+ ]
65
68
  html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
66
69
  xml = ["dep:quick-xml", "dep:roxmltree"]
67
70
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
@@ -254,12 +257,12 @@ biblib = { version = "0.4", default-features = false, features = [
254
257
  bitvec = "1.0"
255
258
  blake3 = "1"
256
259
  bytes = { version = "1", features = ["serde"] }
257
- calamine = { version = "0.34.0", features = ["dates"], optional = true }
260
+ calamine = { version = "0.35.0", features = ["dates"], optional = true }
258
261
  cfb = { version = "0.14", optional = true }
259
262
  chardetng = { version = "1.0.0", optional = true }
260
263
  chrono = { version = "0.4", optional = true }
261
264
  comrak = { version = "0.52", default-features = false }
262
- dashmap = "6.1"
265
+ dashmap = "6.2"
263
266
  dbase = { version = "0.7", optional = true }
264
267
  dirs = "6"
265
268
  encoding_rs = { version = "0.8.35" }
@@ -271,7 +274,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
274
  "simd",
272
275
  ], optional = true }
273
276
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
277
+ html-to-markdown-rs = { version = "3.5.7", default-features = false, features = [
275
278
  "inline-images",
276
279
  "metadata",
277
280
  ], optional = true }
@@ -291,20 +294,20 @@ jotdown = "0.10"
291
294
  kamadak-exif = { version = "0.6.1", optional = true }
292
295
 
293
296
  kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
294
- libc = "0.2.185"
295
- liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false, optional = true }
297
+ libc = "0.2.186"
298
+ liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
296
299
  log = "0.4"
297
- lopdf = { version = "0.40.0", optional = true }
298
- mail-parser = { version = "0.11.2", optional = true }
299
- memchr = "2.8.0"
300
+ lopdf = { version = "0.41.0", optional = true }
301
+ mail-parser = { version = "0.11.3", optional = true }
302
+ memchr = "2.8.1"
300
303
  memmap2 = "0.9"
301
304
  mime_guess = "2.0"
302
305
  minijinja = { version = "2", optional = true }
303
306
  ndarray = { version = "0.17", optional = true }
304
307
  num_cpus = "1.17.0"
305
308
  once_cell = "1.21.4"
306
- opentelemetry = { version = "0.31", features = ["trace"], optional = true }
307
- opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"], optional = true }
309
+ opentelemetry = { version = "0.32", features = ["trace"], optional = true }
310
+ opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
308
311
  org = { version = "0.3", optional = true }
309
312
  ort = { version = "2.0.0-rc.12", default-features = false, features = [
310
313
  "std",
@@ -314,14 +317,13 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
317
  outlook-pst = { version = "1.2.0", optional = true }
315
318
  parking_lot = "0.12.5"
316
319
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
318
320
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
321
  pulldown-cmark = { version = "0.13" }
320
- quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
322
+ quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
321
323
  rake = { version = "0.3.6", optional = true }
322
324
  rayon = "1.12.0"
323
325
  regex = "1.12.3"
324
- rmcp = { version = "1.5.0", features = [
326
+ rmcp = { version = "1.7.0", features = [
325
327
  "server",
326
328
  "macros",
327
329
  "base64",
@@ -333,32 +335,36 @@ rmp-serde = "1.3"
333
335
 
334
336
  roxmltree = { version = "0.21.1", optional = true }
335
337
  serde = { version = "1.0.228", features = ["derive"] }
336
- serde_json = { version = "1.0.149" }
338
+ serde_json = { version = "1.0.150" }
337
339
  serde_toon_format = "0.1"
338
340
  serde_yaml_ng = "0.10.0"
339
341
  sevenz-rust2 = { version = "0.20.2", optional = true }
340
342
  sha2 = { version = "0.11", optional = true }
341
343
  simdutf8 = { version = "0.1", optional = true }
342
344
  snap = { version = "1.1", optional = true }
343
- tar = { version = "0.4.45", optional = true }
345
+ tar = { version = "0.4.46", optional = true }
344
346
  tempfile = { version = "3.27.0", optional = true }
345
- text-splitter = { version = "0.30.1", features = ["markdown"], optional = true }
347
+ text-splitter = { version = "0.31.0", features = ["markdown"], optional = true }
346
348
  thiserror = "2.0.18"
347
349
  tiff = { version = "0.11", optional = true }
348
- tokenizers = { version = "0.22", optional = true, default-features = false, features = [
350
+ # Keep aligned with text-splitter's optional tokenizers integration so ChunkSizer
351
+ # is implemented for the same Tokenizer type used by Kreuzberg.
352
+ tokenizers = { version = "0.23.1", optional = true, default-features = false, features = [
349
353
  "http",
350
354
  "fancy-regex",
351
355
  ] }
352
- tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
356
+ tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
353
357
  toml = "1.1.2"
354
358
  tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
355
359
  tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
356
360
  tracing = "0.1"
357
- tracing-opentelemetry = { version = "0.32", optional = true }
361
+ tracing-opentelemetry = { version = "0.33", optional = true }
358
362
  unicode-normalization = { version = "0.1.25", optional = true }
359
363
  urlencoding = "2"
360
- utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
361
- v_htmlescape = { version = "0.15", optional = true }
364
+ utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
365
+ # Pinned to 0.15: v_htmlescape 0.17 renamed the `escape` fn to an `Escape` struct.
366
+ # Update call sites in src/rendering/html_styled.rs before bumping.
367
+ v_htmlescape = { version = "=0.15.8", optional = true }
362
368
  whatlang = { version = "0.18.0", optional = true }
363
369
  zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
364
370
  "deflate-flate2",
@@ -392,7 +398,7 @@ optional = true
392
398
  # Override getrandom to enable js feature for WASM targets
393
399
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
400
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
401
+ tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false, optional = true }
396
402
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
403
 
398
404
  [build-dependencies]
@@ -406,10 +412,10 @@ dotenvy = "0.15"
406
412
  filetime = "0.2"
407
413
  image = { version = "0.25.10", default-features = false, features = ["png"] }
408
414
  jsonschema = "0.46"
409
- serial_test = "3.4.0"
410
- tar = "0.4.45"
415
+ serial_test = "3.5.0"
416
+ tar = "0.4.46"
411
417
  tempfile = "3.27.0"
412
- tokio = { version = "1.52.1", features = ["macros", "time"] }
418
+ tokio = { version = "1.52.3", features = ["macros", "time"] }
413
419
  tokio-test = "0.4"
414
420
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
415
421
  zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.7 Release**
21
+ > **🚀 Version 4.9.9 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -8,17 +8,14 @@ use serde::{Deserialize, Serialize};
8
8
  /// PDF extraction backend selection.
9
9
  ///
10
10
  /// Controls which PDF library is used for text extraction:
11
- /// - `Pdfium`: pdfium-render (default, C++ based, mature)
12
- /// - `PdfOxide`: pdf_oxide (pure Rust, faster, requires `pdf-oxide` feature)
13
- /// - `Auto`: automatically select based on available features
11
+ /// - `Pdfium`: pdfium-render (default, mature)
12
+ /// - `Auto`: automatically select the default available backend
14
13
  #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
15
14
  #[serde(rename_all = "lowercase")]
16
15
  pub enum PdfBackend {
17
16
  /// Use pdfium-render backend (default).
18
17
  #[default]
19
18
  Pdfium,
20
- /// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
21
- PdfOxide,
22
19
  /// Automatically select the best available backend.
23
20
  Auto,
24
21
  }
@@ -155,7 +155,6 @@ impl From<&TreeSitterProcessConfig> for tree_sitter_language_pack::ProcessConfig
155
155
  symbols: p.symbols,
156
156
  diagnostics: p.diagnostics,
157
157
  chunk_max_size: p.chunk_max_size,
158
- extractions: None,
159
158
  }
160
159
  }
161
160
  }
@@ -128,7 +128,12 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
128
128
 
129
129
  #[cfg(not(feature = "tokio-runtime"))]
130
130
  let result = {
131
- let _ = config.extraction_timeout_secs;
131
+ if config.extraction_timeout_secs.is_some() {
132
+ return Err(crate::KreuzbergError::Validation {
133
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
134
+ source: None,
135
+ });
136
+ }
132
137
  extraction_future.await
133
138
  };
134
139
 
@@ -142,7 +142,12 @@ pub async fn extract_file(
142
142
 
143
143
  #[cfg(not(feature = "tokio-runtime"))]
144
144
  let result = {
145
- let _ = config.extraction_timeout_secs;
145
+ if config.extraction_timeout_secs.is_some() {
146
+ return Err(crate::KreuzbergError::Validation {
147
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
148
+ source: None,
149
+ });
150
+ }
146
151
  extraction_future.await
147
152
  };
148
153
 
@@ -35,6 +35,13 @@ pub(super) fn extract_bytes_sync_impl(
35
35
  let cfg = config.cloned().unwrap_or_default();
36
36
  let cfg = cfg.normalized().into_owned();
37
37
 
38
+ if cfg.extraction_timeout_secs.is_some() {
39
+ return Err(crate::KreuzbergError::Validation {
40
+ message: "extraction_timeout_secs requires the 'tokio-runtime' feature to be enabled".to_string(),
41
+ source: None,
42
+ });
43
+ }
44
+
38
45
  let validated_mime = if let Some(mime) = mime_type {
39
46
  if mime == "application/octet-stream" {
40
47
  mime::detect_mime_type_from_bytes(content)?
@@ -37,10 +37,19 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
37
37
  continue;
38
38
  }
39
39
 
40
- // Try exact match first
41
- if let Some(pos) = content[search_offset..].find(&page.content) {
40
+ let normalized: String = page
41
+ .content
42
+ .split("\n\n")
43
+ .map(str::trim)
44
+ .filter(|s| !s.is_empty())
45
+ .collect::<Vec<_>>()
46
+ .join("\n\n");
47
+
48
+ // Try normalized exact match first. PDF page text can contain trailing
49
+ // spaces that render_plain strips before chunking.
50
+ if let Some(pos) = content[search_offset..].find(normalized.as_str()) {
42
51
  let byte_start = search_offset + pos;
43
- let byte_end = content.floor_char_boundary(byte_start + page.content.len());
52
+ let byte_end = content.floor_char_boundary(byte_start + normalized.len());
44
53
  boundaries.push(PageBoundary {
45
54
  page_number: page.page_number,
46
55
  byte_start,
@@ -50,12 +59,12 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
50
59
  continue;
51
60
  }
52
61
 
53
- // Fallback: search for first non-empty line of page content
62
+ // Fallback: search for first non-empty line of page content.
54
63
  if let Some(line) = page.content.lines().find(|l| !l.trim().is_empty()).map(|l| l.trim())
55
64
  && let Some(pos) = content[search_offset..].find(line)
56
65
  {
57
66
  let byte_start = search_offset + pos;
58
- let raw_end = (byte_start + page.content.len()).min(content.len());
67
+ let raw_end = (byte_start + normalized.len()).min(content.len());
59
68
  let byte_end = content.floor_char_boundary(raw_end);
60
69
  boundaries.push(PageBoundary {
61
70
  page_number: page.page_number,
@@ -176,25 +185,27 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
176
185
  let resolved_config = chunking_config.resolve_preset();
177
186
  let chunking_config = &resolved_config;
178
187
 
179
- // Recompute page boundaries against `result.content` (rendered by `render_plain`)
180
- // if per-page content is available. The boundaries stored in
181
- // `result.metadata.pages.boundaries` were computed against the raw extractor text
182
- // and may have different byte offsets than the rendered content (fix for #636).
188
+ let (chunk_input, heading_source) = if config.output_format != crate::core::config::OutputFormat::Plain {
189
+ (
190
+ result.formatted_content.as_deref().unwrap_or(result.content.as_str()),
191
+ None,
192
+ )
193
+ } else {
194
+ (result.content.as_str(), result.formatted_content.as_deref())
195
+ };
196
+
183
197
  let recomputed_boundaries: Option<Vec<PageBoundary>> = result
184
198
  .pages
185
199
  .as_deref()
186
- .map(|pages| recompute_boundaries_from_pages(&result.content, pages));
200
+ .map(|pages| recompute_boundaries_from_pages(chunk_input, pages))
201
+ .filter(|boundaries| !boundaries.is_empty());
187
202
 
188
203
  let page_boundaries: Option<&[PageBoundary]> = recomputed_boundaries
189
204
  .as_deref()
190
205
  .or_else(|| result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref()));
191
206
 
192
- // Pass formatted_content (markdown) for heading context resolution when available.
193
- // Plain-text rendering strips heading markers, but the markdown chunker needs them
194
- // to build the heading hierarchy for chunk metadata.
195
- let heading_source = result.formatted_content.as_deref();
196
207
  match crate::chunking::chunk_text_with_heading_source(
197
- &result.content,
208
+ chunk_input,
198
209
  chunking_config,
199
210
  page_boundaries,
200
211
  heading_source,
@@ -314,3 +325,92 @@ pub(super) fn execute_token_reduction(result: &mut ExtractionResult, config: &Ex
314
325
 
315
326
  Ok(())
316
327
  }
328
+
329
+ #[cfg(test)]
330
+ #[cfg(feature = "chunking")]
331
+ mod tests {
332
+ use super::*;
333
+ use crate::core::config::{ChunkerType, ChunkingConfig, OutputFormat};
334
+ use crate::types::PageContent;
335
+
336
+ fn make_page(page_number: usize, content: &str) -> PageContent {
337
+ PageContent {
338
+ page_number,
339
+ content: content.to_string(),
340
+ tables: Vec::new(),
341
+ images: Vec::new(),
342
+ hierarchy: None,
343
+ is_blank: None,
344
+ layout_regions: None,
345
+ }
346
+ }
347
+
348
+ fn markdown_chunking_config() -> ExtractionConfig {
349
+ ExtractionConfig {
350
+ output_format: OutputFormat::Markdown,
351
+ chunking: Some(ChunkingConfig {
352
+ max_characters: 2000,
353
+ overlap: 0,
354
+ trim: true,
355
+ chunker_type: ChunkerType::Markdown,
356
+ ..Default::default()
357
+ }),
358
+ ..Default::default()
359
+ }
360
+ }
361
+
362
+ #[test]
363
+ fn chunks_content_is_markdown_when_output_format_is_markdown() {
364
+ let mut result = ExtractionResult {
365
+ content: "SH-001 Luca Bianchi Common Germany 3500000".to_string(),
366
+ formatted_content: Some("| SH-001 | Luca Bianchi | Common | Germany | 3,500,000 |".to_string()),
367
+ mime_type: Cow::Borrowed("application/pdf"),
368
+ ..Default::default()
369
+ };
370
+
371
+ execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
372
+
373
+ let chunks = result.chunks.expect("chunks must be populated");
374
+ assert!(!chunks.is_empty());
375
+ assert!(chunks.iter().any(|chunk| chunk.content.contains('|')));
376
+ assert!(chunks.iter().all(|chunk| !chunk.content.starts_with("SH-001 Luca")));
377
+ assert!(result.formatted_content.is_some());
378
+ }
379
+
380
+ #[test]
381
+ fn markdown_chunks_preserve_page_metadata_when_formatted_pages_match() {
382
+ let mut result = ExtractionResult {
383
+ content: "Page one text\n\nPage two text".to_string(),
384
+ formatted_content: Some("# Page one\n\nPage one text\n\n# Page two\n\nPage two text".to_string()),
385
+ pages: Some(vec![make_page(1, "Page one text"), make_page(2, "Page two text")]),
386
+ mime_type: Cow::Borrowed("application/pdf"),
387
+ ..Default::default()
388
+ };
389
+
390
+ execute_chunking(&mut result, &markdown_chunking_config()).unwrap();
391
+
392
+ let chunks = result.chunks.expect("chunks must be populated");
393
+ assert!(!chunks.is_empty());
394
+ assert!(chunks.iter().any(|chunk| chunk.metadata.first_page.is_some()));
395
+ assert!(chunks.iter().any(|chunk| chunk.metadata.last_page.is_some()));
396
+ }
397
+
398
+ #[test]
399
+ fn recompute_boundaries_trailing_space_pages_all_resolve() {
400
+ let p1_raw = "Heading \n\nBody paragraph one. ";
401
+ let p2_raw = "Second heading \n\nBody paragraph two. ";
402
+ let p3_raw = "Conclusion. ";
403
+ let p1_norm = "Heading\n\nBody paragraph one.";
404
+ let p2_norm = "Second heading\n\nBody paragraph two.";
405
+ let p3_norm = "Conclusion.";
406
+ let content = format!("{p1_norm}\n\n{p2_norm}\n\n{p3_norm}");
407
+
408
+ let pages = vec![make_page(1, p1_raw), make_page(2, p2_raw), make_page(3, p3_raw)];
409
+ let boundaries = recompute_boundaries_from_pages(&content, &pages);
410
+
411
+ assert_eq!(boundaries.len(), 3);
412
+ assert_eq!(&content[boundaries[0].byte_start..boundaries[0].byte_end], p1_norm);
413
+ assert_eq!(&content[boundaries[1].byte_start..boundaries[1].byte_end], p2_norm);
414
+ assert_eq!(&content[boundaries[2].byte_start..boundaries[2].byte_end], p3_norm);
415
+ }
416
+ }
@@ -270,11 +270,13 @@ fn load_tokenizer(
270
270
  {
271
271
  for (_, value) in &map {
272
272
  if let Some(content) = value.as_str() {
273
- tokenizer.add_special_tokens(&[AddedToken {
274
- content: content.to_string(),
275
- special: true,
276
- ..Default::default()
277
- }]);
273
+ tokenizer
274
+ .add_special_tokens([AddedToken {
275
+ content: content.to_string(),
276
+ special: true,
277
+ ..Default::default()
278
+ }])
279
+ .map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
278
280
  } else if value.is_object()
279
281
  && let (Some(content), Some(single_word), Some(lstrip), Some(rstrip), Some(normalized)) = (
280
282
  value["content"].as_str(),
@@ -284,14 +286,16 @@ fn load_tokenizer(
284
286
  value["normalized"].as_bool(),
285
287
  )
286
288
  {
287
- tokenizer.add_special_tokens(&[AddedToken {
288
- content: content.to_string(),
289
- special: true,
290
- single_word,
291
- lstrip,
292
- rstrip,
293
- normalized,
294
- }]);
289
+ tokenizer
290
+ .add_special_tokens([AddedToken {
291
+ content: content.to_string(),
292
+ special: true,
293
+ single_word,
294
+ lstrip,
295
+ rstrip,
296
+ normalized,
297
+ }])
298
+ .map_err(|e| crate::KreuzbergError::embedding(format!("Failed to add special token: {e}")))?;
295
299
  }
296
300
  }
297
301
  }