kreuzberg 4.7.0 → 4.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +25 -24
  4. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  5. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/vendor/Cargo.toml +6 -6
  8. data/vendor/kreuzberg/Cargo.toml +7 -6
  9. data/vendor/kreuzberg/README.md +1 -1
  10. data/vendor/kreuzberg/src/api/handlers.rs +3 -22
  11. data/vendor/kreuzberg/src/cache/core.rs +1 -3
  12. data/vendor/kreuzberg/src/cache_dir.rs +53 -0
  13. data/vendor/kreuzberg/src/chunking/boundaries.rs +26 -5
  14. data/vendor/kreuzberg/src/chunking/processor.rs +9 -0
  15. data/vendor/kreuzberg/src/core/config/concurrency.rs +3 -0
  16. data/vendor/kreuzberg/src/core/config/extraction/core.rs +15 -8
  17. data/vendor/kreuzberg/src/core/extractor/batch.rs +1 -1
  18. data/vendor/kreuzberg/src/core/extractor/helpers.rs +1 -0
  19. data/vendor/kreuzberg/src/core/pipeline/features.rs +80 -1
  20. data/vendor/kreuzberg/src/core/pipeline/tests.rs +43 -0
  21. data/vendor/kreuzberg/src/doc_orientation.rs +13 -9
  22. data/vendor/kreuzberg/src/embeddings/mod.rs +35 -19
  23. data/vendor/kreuzberg/src/extraction/derive.rs +1 -0
  24. data/vendor/kreuzberg/src/extraction/image_ocr.rs +7 -12
  25. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
  26. data/vendor/kreuzberg/src/extraction/transform/mod.rs +6 -0
  27. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -0
  28. data/vendor/kreuzberg/src/extractors/odt.rs +21 -0
  29. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +32 -3
  30. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +77 -105
  31. data/vendor/kreuzberg/src/keywords/processor.rs +7 -0
  32. data/vendor/kreuzberg/src/language_detection/processor.rs +5 -0
  33. data/vendor/kreuzberg/src/lib.rs +1 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +4 -0
  35. data/vendor/kreuzberg/src/mcp/params.rs +31 -0
  36. data/vendor/kreuzberg/src/mcp/server.rs +3 -12
  37. data/vendor/kreuzberg/src/model_download.rs +4 -11
  38. data/vendor/kreuzberg/src/ocr/cache.rs +13 -10
  39. data/vendor/kreuzberg/src/ocr/hocr_parser.rs +471 -10
  40. data/vendor/kreuzberg/src/ocr/layout_assembly.rs +18 -13
  41. data/vendor/kreuzberg/src/ocr/processor/execution.rs +89 -31
  42. data/vendor/kreuzberg/src/ocr/processor/mod.rs +1 -0
  43. data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +1 -10
  44. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  45. data/vendor/kreuzberg/src/ocr/types.rs +1 -0
  46. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +166 -1
  47. data/vendor/kreuzberg/src/paddle_ocr/config.rs +16 -12
  48. data/vendor/kreuzberg/src/pdf/layout_runner.rs +48 -0
  49. data/vendor/kreuzberg/src/pdf/structure/adapters.rs +338 -79
  50. data/vendor/kreuzberg/src/pdf/structure/assembly.rs +84 -6
  51. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1 -1
  52. data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +0 -3
  53. data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +49 -2
  54. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +156 -1
  55. data/vendor/kreuzberg/src/pdf/structure/text_repair.rs +47 -16
  56. data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +28 -1
  57. data/vendor/kreuzberg/src/pdf/text.rs +100 -0
  58. data/vendor/kreuzberg/src/plugins/ocr.rs +1 -0
  59. data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -0
  60. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +2 -0
  61. data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -0
  62. data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +4 -2
  63. data/vendor/kreuzberg/src/rendering/djot.rs +3 -2
  64. data/vendor/kreuzberg/src/rendering/markdown.rs +15 -0
  65. data/vendor/kreuzberg/src/text/quality_processor.rs +5 -0
  66. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +28 -10
  67. data/vendor/kreuzberg/src/types/extraction.rs +8 -0
  68. data/vendor/kreuzberg/src/types/formats.rs +5 -0
  69. data/vendor/kreuzberg/tests/config_features.rs +6 -33
  70. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +2 -2
  71. data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
  72. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  73. data/vendor/kreuzberg-ffi/src/helpers.rs +5 -0
  74. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +5 -3
  75. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +7 -4
  76. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +6 -4
  77. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +3 -2
  78. data/vendor/kreuzberg-ffi/src/result.rs +1 -0
  79. data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
  80. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  81. data/vendor/kreuzberg-paddle-ocr/src/angle_net.rs +4 -4
  82. data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +1 -1
  83. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  85. data/vendor/kreuzberg-tesseract/build.rs +1 -1
  86. data/vendor/kreuzberg-tesseract/src/leptonica.rs +57 -0
  87. metadata +3 -9
  88. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -68
  89. data/vendor/kreuzberg/examples/download_paddle_models.rs +0 -358
  90. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  91. data/vendor/kreuzberg-pdfium-render/examples/artifact_check.rs +0 -70
  92. data/vendor/kreuzberg-pdfium-render/examples/char_order_check.rs +0 -63
  93. data/vendor/kreuzberg-pdfium-render/examples/ffi_bench.rs +0 -207
  94. data/vendor/kreuzberg-pdfium-render/examples/seg_dump.rs +0 -54
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 86f862f2bccbe3ad535c3d392c8bd04fbe6976de1dfd296a264a8a9f413f4617
4
- data.tar.gz: 15aa589f639a12a312d3827b6c392ac39f5a9d7a2c5ebb82174dea9f9e687a06
3
+ metadata.gz: c713d0a652ed4e1752ce0c3f6e044516e870349de149b2fdd5cb6c429c722d65
4
+ data.tar.gz: b519e5b70800b61bd6e5d6acef3a0fee0de8d53f0ad625e863753573f79caf4f
5
5
  SHA512:
6
- metadata.gz: 6bfa0020321043200dde821eafbdca7e6ba1b0abeec4bbb41b43599d3e2cc36c74caf8174673e444e8b1e90c15ef3a8babbac43810529b109ec73c1878bbe6fa
7
- data.tar.gz: ce6e20a7d4dba4617c426fa0589d9e22c74ef85fdec9229ae525d2357fa3134f6e1a52c61d83c8321dc1f8c8ac745dbea702a07544005bafbe847df387d67ab7
6
+ metadata.gz: 0cec6f964c7975e905997422f296a129fe32974c221300f1d6f89d5f54b27ca48e7343522e961d044cc90d9c7f0f4ef20c42de0c95b20e6794614cd1243bd03b
7
+ data.tar.gz: 8b173be0cb820ade74c4e572465a28257911e9cd1d4ed510f93d26e65529c47f1e637f59a215171b8f931f4a45037bb9b9e5069fd4542b4f911fa36908c65364
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -699,9 +699,9 @@ dependencies = [
699
699
 
700
700
  [[package]]
701
701
  name = "cc"
702
- version = "1.2.58"
702
+ version = "1.2.59"
703
703
  source = "registry+https://github.com/rust-lang/crates.io-index"
704
- checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1"
704
+ checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283"
705
705
  dependencies = [
706
706
  "find-msvc-tools",
707
707
  "jobserver",
@@ -928,9 +928,9 @@ checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
928
928
 
929
929
  [[package]]
930
930
  name = "comrak"
931
- version = "0.51.0"
931
+ version = "0.52.0"
932
932
  source = "registry+https://github.com/rust-lang/crates.io-index"
933
- checksum = "9f07383e7799d964bf7ffa6fc4457d177c54a44614661c7458bb0bd91b108e32"
933
+ checksum = "aac0b255932a9cd52fbfd664b67957f9f2e095ae4711cb0e41b4e291edef94c2"
934
934
  dependencies = [
935
935
  "caseless",
936
936
  "entities",
@@ -2550,9 +2550,9 @@ checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
2550
2550
 
2551
2551
  [[package]]
2552
2552
  name = "indexmap"
2553
- version = "2.13.0"
2553
+ version = "2.13.1"
2554
2554
  source = "registry+https://github.com/rust-lang/crates.io-index"
2555
- checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
2555
+ checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
2556
2556
  dependencies = [
2557
2557
  "equivalent",
2558
2558
  "hashbrown 0.16.1",
@@ -2792,7 +2792,7 @@ dependencies = [
2792
2792
 
2793
2793
  [[package]]
2794
2794
  name = "kreuzberg"
2795
- version = "4.7.0"
2795
+ version = "4.7.2"
2796
2796
  dependencies = [
2797
2797
  "ahash",
2798
2798
  "async-trait",
@@ -2810,6 +2810,7 @@ dependencies = [
2810
2810
  "comrak",
2811
2811
  "dashmap",
2812
2812
  "dbase",
2813
+ "dirs",
2813
2814
  "encoding_rs",
2814
2815
  "fast_image_resize",
2815
2816
  "flate2",
@@ -2884,7 +2885,7 @@ dependencies = [
2884
2885
 
2885
2886
  [[package]]
2886
2887
  name = "kreuzberg-ffi"
2887
- version = "4.7.0"
2888
+ version = "4.7.2"
2888
2889
  dependencies = [
2889
2890
  "ahash",
2890
2891
  "async-trait",
@@ -2900,7 +2901,7 @@ dependencies = [
2900
2901
 
2901
2902
  [[package]]
2902
2903
  name = "kreuzberg-paddle-ocr"
2903
- version = "4.7.0"
2904
+ version = "4.7.2"
2904
2905
  dependencies = [
2905
2906
  "geo-clipper",
2906
2907
  "geo-types",
@@ -2914,7 +2915,7 @@ dependencies = [
2914
2915
 
2915
2916
  [[package]]
2916
2917
  name = "kreuzberg-pdfium-render"
2917
- version = "4.7.0"
2918
+ version = "4.7.2"
2918
2919
  dependencies = [
2919
2920
  "bitflags",
2920
2921
  "bytemuck",
@@ -2937,7 +2938,7 @@ dependencies = [
2937
2938
 
2938
2939
  [[package]]
2939
2940
  name = "kreuzberg-rb"
2940
- version = "4.7.0"
2941
+ version = "4.7.1"
2941
2942
  dependencies = [
2942
2943
  "async-trait",
2943
2944
  "html-to-markdown-rs",
@@ -2954,7 +2955,7 @@ dependencies = [
2954
2955
 
2955
2956
  [[package]]
2956
2957
  name = "kreuzberg-tesseract"
2957
- version = "4.7.0"
2958
+ version = "4.7.2"
2958
2959
  dependencies = [
2959
2960
  "cc",
2960
2961
  "cmake",
@@ -4264,18 +4265,18 @@ dependencies = [
4264
4265
 
4265
4266
  [[package]]
4266
4267
  name = "rb-sys"
4267
- version = "0.9.125"
4268
+ version = "0.9.126"
4268
4269
  source = "registry+https://github.com/rust-lang/crates.io-index"
4269
- checksum = "85b37650fabd8ba515910a0dc089dcb6348eb3c35fbf91698cb226435be2babc"
4270
+ checksum = "284799e73e899fe946fd77c7211b83bff61a1356e039ade7a2516a779e3212d0"
4270
4271
  dependencies = [
4271
4272
  "rb-sys-build",
4272
4273
  ]
4273
4274
 
4274
4275
  [[package]]
4275
4276
  name = "rb-sys-build"
4276
- version = "0.9.125"
4277
+ version = "0.9.126"
4277
4278
  source = "registry+https://github.com/rust-lang/crates.io-index"
4278
- checksum = "c73b806faa66006e491458b48a78725621c1ac5a2a6efe2614c90711a7780b80"
4279
+ checksum = "855fc1ad8943d12c89ef12f9147f1cc531f5bf19fb744112fdd317bb6ee7b5c5"
4279
4280
  dependencies = [
4280
4281
  "bindgen",
4281
4282
  "lazy_static",
@@ -4712,9 +4713,9 @@ dependencies = [
4712
4713
 
4713
4714
  [[package]]
4714
4715
  name = "semver"
4715
- version = "1.0.27"
4716
+ version = "1.0.28"
4716
4717
  source = "registry+https://github.com/rust-lang/crates.io-index"
4717
- checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
4718
+ checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
4718
4719
 
4719
4720
  [[package]]
4720
4721
  name = "seq-macro"
@@ -5374,9 +5375,9 @@ dependencies = [
5374
5375
 
5375
5376
  [[package]]
5376
5377
  name = "tokio"
5377
- version = "1.50.0"
5378
+ version = "1.51.0"
5378
5379
  source = "registry+https://github.com/rust-lang/crates.io-index"
5379
- checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
5380
+ checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd"
5380
5381
  dependencies = [
5381
5382
  "bytes",
5382
5383
  "libc",
@@ -5390,9 +5391,9 @@ dependencies = [
5390
5391
 
5391
5392
  [[package]]
5392
5393
  name = "tokio-macros"
5393
- version = "2.6.1"
5394
+ version = "2.7.0"
5394
5395
  source = "registry+https://github.com/rust-lang/crates.io-index"
5395
- checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
5396
+ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
5396
5397
  dependencies = [
5397
5398
  "proc-macro2",
5398
5399
  "quote",
@@ -6616,9 +6617,9 @@ dependencies = [
6616
6617
 
6617
6618
  [[package]]
6618
6619
  name = "writeable"
6619
- version = "0.6.2"
6620
+ version = "0.6.3"
6620
6621
  source = "registry+https://github.com/rust-lang/crates.io-index"
6621
- checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
6622
+ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
6622
6623
 
6623
6624
  [[package]]
6624
6625
  name = "wyz"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.7.0"
3
+ version = "4.7.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
49
49
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
50
50
  "rb-sys",
51
51
  ] }
52
- rb-sys = { version = "0.9.125", default-features = false, features = [
52
+ rb-sys = { version = "0.9.126", default-features = false, features = [
53
53
  "stable-api-compiled-fallback",
54
54
  ] }
55
55
  serde_json = "1.0.149"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
- tokio = { version = "1.50.0", features = [
58
+ tokio = { version = "1.51.0", features = [
59
59
  "rt",
60
60
  "rt-multi-thread",
61
61
  "macros",
@@ -109,6 +109,7 @@ impl OcrBackend for RubyOcrBackend {
109
109
  code_intelligence: None,
110
110
  formatted_content: None,
111
111
  uris: None,
112
+ ocr_internal_document: None,
112
113
  })
113
114
  })
114
115
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.7.0'
4
+ VERSION = '4.7.2'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.7.0"
5
+ version = "4.7.2"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -20,7 +20,7 @@ bytes = { version = "1", features = ["serde"] }
20
20
  cfb = "0.14"
21
21
  chrono = "0.4"
22
22
  clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
23
- comrak = { version = "0.51", default-features = false }
23
+ comrak = { version = "0.52", default-features = false }
24
24
  console_error_panic_hook = "0.1"
25
25
  criterion = { version = "0.8", features = ["html_reports"] }
26
26
  ctor = "0.8"
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.7.0", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.0" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.7.2", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.2" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  log = "0.4"
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
43
43
  once_cell = "1.21.4"
44
44
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
45
45
  parking_lot = "0.12.5"
46
- pdf_oxide = { version = "0.3.18", default-features = false }
46
+ pdf_oxide = { version = "0.3.19", default-features = false }
47
47
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
48
48
  rayon = "1.11.0"
49
49
  reqwest = { version = "0.13.2", default-features = false }
@@ -52,7 +52,7 @@ serde_json = { version = "1.0.149" }
52
52
  serde_toon_format = "0.1"
53
53
  tempfile = "3.27.0"
54
54
  thiserror = "2.0.18"
55
- tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
55
+ tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
56
56
  toml = "1.1.2"
57
57
  tracing = "0.1"
58
58
  tree-sitter-language-pack = { version = "1.4.1", features = ["serde"], default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.7.0"
3
+ version = "4.7.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -253,9 +253,10 @@ calamine = { version = "0.34.0", features = ["dates"], optional = true }
253
253
  cfb = { version = "0.14", optional = true }
254
254
  chardetng = { version = "1.0.0", optional = true }
255
255
  chrono = { version = "0.4", optional = true }
256
- comrak = { version = "0.51", default-features = false }
256
+ comrak = { version = "0.52", default-features = false }
257
257
  dashmap = "6.1"
258
258
  dbase = { version = "0.7", optional = true }
259
+ dirs = "6"
259
260
  encoding_rs = { version = "0.8.35" }
260
261
  fast_image_resize = { version = "6.0.0", optional = true }
261
262
  flate2 = { version = "1.1", optional = true }
@@ -279,7 +280,7 @@ image = { version = "0.25.10", default-features = false, features = [
279
280
  "pnm",
280
281
  "rayon",
281
282
  ], optional = true }
282
- indexmap = "2.13.0"
283
+ indexmap = "2.13.1"
283
284
  infer = "0.19.0"
284
285
  jotdown = "0.9"
285
286
  kamadak-exif = { version = "0.6.1", optional = true }
@@ -306,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
306
307
  outlook-pst = { version = "1.2.0", optional = true }
307
308
  parking_lot = "0.12.5"
308
309
  pastey = "0.2"
309
- pdf_oxide = { version = "0.3.18", default-features = false, optional = true }
310
+ pdf_oxide = { version = "0.3.19", default-features = false, optional = true }
310
311
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
311
312
  pulldown-cmark = { version = "0.13" }
312
313
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -341,7 +342,7 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
341
342
  "http",
342
343
  "fancy-regex",
343
344
  ] }
344
- tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
345
+ tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
345
346
  toml = "1.1.2"
346
347
  tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
347
348
  tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
@@ -400,7 +401,7 @@ jsonschema = "0.45"
400
401
  serial_test = "3.4.0"
401
402
  tar = "0.4.45"
402
403
  tempfile = "3.27.0"
403
- tokio = { version = "1.50.0", features = ["macros", "time"] }
404
+ tokio = { version = "1.51.0", features = ["macros", "time"] }
404
405
  tokio-test = "0.4"
405
406
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
406
407
  zip = { version = ">=7.0.0, <7.4.0", default-features = false, features = ["deflate-flate2"] }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.7.0 Release**
21
+ > **🚀 Version 4.7.2 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -316,14 +316,7 @@ pub async fn formats_handler() -> Json<Vec<crate::SupportedFormat>> {
316
316
  )]
317
317
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
318
318
  pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
319
- let cache_dir = std::env::current_dir()
320
- .map_err(|e| {
321
- ApiError::internal(crate::error::KreuzbergError::Other(format!(
322
- "Failed to get current directory: {}",
323
- e
324
- )))
325
- })?
326
- .join(".kreuzberg");
319
+ let cache_dir = crate::cache_dir::resolve_cache_base();
327
320
 
328
321
  let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
329
322
  ApiError::internal(crate::error::KreuzbergError::Other(format!(
@@ -365,14 +358,7 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
365
358
  )]
366
359
  #[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
367
360
  pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
368
- let cache_dir = std::env::current_dir()
369
- .map_err(|e| {
370
- ApiError::internal(crate::error::KreuzbergError::Other(format!(
371
- "Failed to get current directory: {}",
372
- e
373
- )))
374
- })?
375
- .join(".kreuzberg");
361
+ let cache_dir = crate::cache_dir::resolve_cache_base();
376
362
 
377
363
  let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
378
364
  ApiError::internal(crate::error::KreuzbergError::Other(format!(
@@ -932,12 +918,7 @@ pub async fn cache_warm_handler(JsonApi(request): JsonApi<WarmRequest>) -> Resul
932
918
 
933
919
  /// Resolve the cache base directory.
934
920
  fn resolve_cache_base() -> std::path::PathBuf {
935
- if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
936
- return std::path::PathBuf::from(env_path);
937
- }
938
- std::env::current_dir()
939
- .unwrap_or_else(|_| std::path::PathBuf::from("."))
940
- .join(".kreuzberg")
921
+ crate::cache_dir::resolve_cache_base()
941
922
  }
942
923
 
943
924
  #[cfg(test)]
@@ -66,10 +66,8 @@ impl GenericCache {
66
66
  ) -> Result<Self> {
67
67
  let cache_dir_path = if let Some(dir) = cache_dir {
68
68
  PathBuf::from(dir).join(&cache_type)
69
- } else if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
70
- PathBuf::from(env_path).join(&cache_type)
71
69
  } else {
72
- std::env::current_dir()?.join(".kreuzberg").join(&cache_type)
70
+ crate::cache_dir::resolve_cache_dir(&cache_type)
73
71
  };
74
72
 
75
73
  fs::create_dir_all(&cache_dir_path)
@@ -0,0 +1,53 @@
1
+ //! Centralized cache directory resolution for all kreuzberg modules.
2
+ //!
3
+ //! Provides a single function that all modules use to determine where to store
4
+ //! cached data (models, OCR results, tessdata, etc.). This avoids per-CWD
5
+ //! `.kreuzberg/` directories and uses platform-appropriate global cache locations.
6
+
7
+ use std::path::PathBuf;
8
+
9
+ /// Resolve the kreuzberg cache base directory (without a module suffix).
10
+ ///
11
+ /// Uses the same resolution order as [`resolve_cache_dir`] but returns
12
+ /// the top-level kreuzberg cache directory.
13
+ #[allow(dead_code)]
14
+ pub fn resolve_cache_base() -> PathBuf {
15
+ if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
16
+ return PathBuf::from(env_path);
17
+ }
18
+ if let Some(cache) = dirs::cache_dir() {
19
+ return cache.join("kreuzberg");
20
+ }
21
+ if let Some(home) = dirs::home_dir() {
22
+ return home.join(".cache").join("kreuzberg");
23
+ }
24
+ std::env::current_dir()
25
+ .unwrap_or_else(|_| PathBuf::from("."))
26
+ .join(".kreuzberg")
27
+ }
28
+
29
+ /// Resolve the kreuzberg cache directory for a given module.
30
+ ///
31
+ /// Resolution order:
32
+ /// 1. `KREUZBERG_CACHE_DIR` env var + `/{module}` (explicit override)
33
+ /// 2. Platform-appropriate global cache directory:
34
+ /// - macOS: `~/Library/Caches/kreuzberg/{module}`
35
+ /// - Linux: `$XDG_CACHE_HOME/kreuzberg/{module}` or `~/.cache/kreuzberg/{module}`
36
+ /// - Windows: `%LOCALAPPDATA%/kreuzberg/{module}`
37
+ /// 3. Home directory fallback: `~/.cache/kreuzberg/{module}`
38
+ /// 4. CWD-relative fallback: `.kreuzberg/{module}` (last resort, e.g. no HOME set)
39
+ pub fn resolve_cache_dir(module: &str) -> PathBuf {
40
+ if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
41
+ return PathBuf::from(env_path).join(module);
42
+ }
43
+ if let Some(cache) = dirs::cache_dir() {
44
+ return cache.join("kreuzberg").join(module);
45
+ }
46
+ if let Some(home) = dirs::home_dir() {
47
+ return home.join(".cache").join("kreuzberg").join(module);
48
+ }
49
+ std::env::current_dir()
50
+ .unwrap_or_else(|_| PathBuf::from("."))
51
+ .join(".kreuzberg")
52
+ .join(module)
53
+ }
@@ -28,9 +28,9 @@ pub fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
28
28
  }
29
29
 
30
30
  for (idx, boundary) in boundaries.iter().enumerate() {
31
- if boundary.byte_start >= boundary.byte_end {
31
+ if boundary.byte_start > boundary.byte_end {
32
32
  return Err(KreuzbergError::validation(format!(
33
- "Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
33
+ "Invalid boundary range at index {}: byte_start ({}) must be <= byte_end ({})",
34
34
  idx, boundary.byte_start, boundary.byte_end
35
35
  )));
36
36
  }
@@ -287,6 +287,7 @@ mod tests {
287
287
 
288
288
  #[test]
289
289
  fn test_chunk_with_same_start_and_end() {
290
+ // Zero-length boundaries are valid (empty pages)
290
291
  let boundaries = vec![PageBoundary {
291
292
  byte_start: 10,
292
293
  byte_end: 10,
@@ -294,8 +295,28 @@ mod tests {
294
295
  }];
295
296
 
296
297
  let result = validate_page_boundaries(&boundaries);
297
- assert!(result.is_err());
298
- let err = result.unwrap_err();
299
- assert!(err.to_string().contains("Invalid boundary range"));
298
+ assert!(result.is_ok());
299
+ }
300
+
301
+ #[test]
302
+ fn test_zero_length_boundary_is_valid() {
303
+ let boundaries = vec![
304
+ PageBoundary {
305
+ byte_start: 0,
306
+ byte_end: 100,
307
+ page_number: 1,
308
+ },
309
+ PageBoundary {
310
+ byte_start: 100,
311
+ byte_end: 100,
312
+ page_number: 2,
313
+ }, // empty page
314
+ PageBoundary {
315
+ byte_start: 100,
316
+ byte_end: 200,
317
+ page_number: 3,
318
+ },
319
+ ];
320
+ assert!(validate_page_boundaries(&boundaries).is_ok());
300
321
  }
301
322
  }
@@ -146,6 +146,7 @@ mod tests {
146
146
  #[cfg(feature = "tree-sitter")]
147
147
  code_intelligence: None,
148
148
  formatted_content: None,
149
+ ocr_internal_document: None,
149
150
  };
150
151
 
151
152
  processor.process(&mut result, &config).await.unwrap();
@@ -183,6 +184,7 @@ mod tests {
183
184
  #[cfg(feature = "tree-sitter")]
184
185
  code_intelligence: None,
185
186
  formatted_content: None,
187
+ ocr_internal_document: None,
186
188
  };
187
189
 
188
190
  processor.process(&mut result, &config).await.unwrap();
@@ -232,6 +234,7 @@ mod tests {
232
234
  #[cfg(feature = "tree-sitter")]
233
235
  code_intelligence: None,
234
236
  formatted_content: None,
237
+ ocr_internal_document: None,
235
238
  };
236
239
 
237
240
  let config_with_chunking = ExtractionConfig {
@@ -277,6 +280,7 @@ mod tests {
277
280
  #[cfg(feature = "tree-sitter")]
278
281
  code_intelligence: None,
279
282
  formatted_content: None,
283
+ ocr_internal_document: None,
280
284
  };
281
285
 
282
286
  let long_result = ExtractionResult {
@@ -302,6 +306,7 @@ mod tests {
302
306
  #[cfg(feature = "tree-sitter")]
303
307
  code_intelligence: None,
304
308
  formatted_content: None,
309
+ ocr_internal_document: None,
305
310
  };
306
311
 
307
312
  let short_duration = processor.estimated_duration_ms(&short_result);
@@ -356,6 +361,7 @@ mod tests {
356
361
  #[cfg(feature = "tree-sitter")]
357
362
  code_intelligence: None,
358
363
  formatted_content: None,
364
+ ocr_internal_document: None,
359
365
  };
360
366
 
361
367
  processor.process(&mut result, &config).await.unwrap();
@@ -402,6 +408,7 @@ mod tests {
402
408
  #[cfg(feature = "tree-sitter")]
403
409
  code_intelligence: None,
404
410
  formatted_content: None,
411
+ ocr_internal_document: None,
405
412
  };
406
413
 
407
414
  processor.process(&mut result, &config).await.unwrap();
@@ -448,6 +455,7 @@ mod tests {
448
455
  #[cfg(feature = "tree-sitter")]
449
456
  code_intelligence: None,
450
457
  formatted_content: None,
458
+ ocr_internal_document: None,
451
459
  };
452
460
 
453
461
  processor.process(&mut result, &config).await.unwrap();
@@ -494,6 +502,7 @@ mod tests {
494
502
  #[cfg(feature = "tree-sitter")]
495
503
  code_intelligence: None,
496
504
  formatted_content: None,
505
+ ocr_internal_document: None,
497
506
  };
498
507
 
499
508
  processor.process(&mut result, &config).await.unwrap();
@@ -68,7 +68,10 @@ pub fn resolve_thread_budget(config: Option<&ConcurrencyConfig>) -> usize {
68
68
  /// ```
69
69
  pub fn init_thread_pools(budget: usize) {
70
70
  POOL_INIT.call_once(|| {
71
+ #[cfg(not(target_arch = "wasm32"))]
71
72
  rayon::ThreadPoolBuilder::new().num_threads(budget).build_global().ok();
73
+ #[cfg(target_arch = "wasm32")]
74
+ let _ = budget;
72
75
  });
73
76
  }
74
77
 
@@ -404,18 +404,25 @@ impl ExtractionConfig {
404
404
  /// - Auto-enabling `extract_pages` when `result_format` is `ElementBased`, because
405
405
  /// the element transformation requires per-page data to assign correct page numbers.
406
406
  /// Without this, all elements would incorrectly get `page_number=1`.
407
+ /// - Auto-enabling `extract_pages` when chunking is configured, because the chunker
408
+ /// needs page boundaries to assign correct page numbers to chunks.
407
409
  pub fn normalized(&self) -> std::borrow::Cow<'_, Self> {
408
- if self.result_format == crate::types::OutputFormat::ElementBased {
409
- let needs_pages = match &self.pages {
410
+ let needs_pages = |cfg: &Self| -> bool {
411
+ match &cfg.pages {
410
412
  Some(page_config) => !page_config.extract_pages,
411
413
  None => true,
412
- };
413
- if needs_pages {
414
- let mut config = self.clone();
415
- let page_config = config.pages.get_or_insert_with(super::super::page::PageConfig::default);
416
- page_config.extract_pages = true;
417
- return std::borrow::Cow::Owned(config);
418
414
  }
415
+ };
416
+
417
+ let needs_pages_for_elements =
418
+ self.result_format == crate::types::OutputFormat::ElementBased && needs_pages(self);
419
+ let needs_pages_for_chunking = self.chunking.is_some() && needs_pages(self);
420
+
421
+ if needs_pages_for_elements || needs_pages_for_chunking {
422
+ let mut config = self.clone();
423
+ let page_config = config.pages.get_or_insert_with(super::super::page::PageConfig::default);
424
+ page_config.extract_pages = true;
425
+ return std::borrow::Cow::Owned(config);
419
426
  }
420
427
  std::borrow::Cow::Borrowed(self)
421
428
  }
@@ -33,7 +33,7 @@ where
33
33
  let max_concurrent = config
34
34
  .max_concurrent_extractions
35
35
  .or_else(|| config.concurrency.as_ref().and_then(|c| c.max_threads))
36
- .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
36
+ .unwrap_or_else(|| crate::core::config::concurrency::resolve_thread_budget(config.concurrency.as_ref()));
37
37
  let semaphore = Arc::new(Semaphore::new(max_concurrent));
38
38
 
39
39
  let mut tasks = JoinSet::new();
@@ -100,6 +100,7 @@ pub(crate) fn error_extraction_result(e: &KreuzbergError, elapsed_ms: Option<u64
100
100
  #[cfg(feature = "tree-sitter")]
101
101
  code_intelligence: None,
102
102
  formatted_content: None,
103
+ ocr_internal_document: None,
103
104
  }
104
105
  }
105
106