kreuzberg 4.8.5 → 4.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +111 -90
  4. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +22 -0
  6. data/ext/kreuzberg_rb/native/src/error_handling.rs +7 -0
  7. data/ext/kreuzberg_rb/native/src/result.rs +7 -0
  8. data/lib/kreuzberg/errors.rb +3 -0
  9. data/lib/kreuzberg/result.rb +52 -5
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +26 -4
  12. data/vendor/Cargo.toml +7 -7
  13. data/vendor/kreuzberg/Cargo.toml +8 -8
  14. data/vendor/kreuzberg/README.md +1 -1
  15. data/vendor/kreuzberg/src/api/error.rs +1 -0
  16. data/vendor/kreuzberg/src/api/handlers.rs +74 -1
  17. data/vendor/kreuzberg/src/api/types.rs +11 -2
  18. data/vendor/kreuzberg/src/cancellation.rs +105 -0
  19. data/vendor/kreuzberg/src/chunking/boundary_detection.rs +496 -0
  20. data/vendor/kreuzberg/src/chunking/core.rs +8 -1
  21. data/vendor/kreuzberg/src/chunking/mod.rs +9 -10
  22. data/vendor/kreuzberg/src/chunking/semantic/merge.rs +477 -0
  23. data/vendor/kreuzberg/src/chunking/semantic/mod.rs +393 -0
  24. data/vendor/kreuzberg/src/chunking/semantic/topic.rs +224 -0
  25. data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +3 -3
  26. data/vendor/kreuzberg/src/core/config/extraction/core.rs +89 -1
  27. data/vendor/kreuzberg/src/core/config/layout.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config/llm.rs +47 -1
  29. data/vendor/kreuzberg/src/core/config/ocr.rs +16 -1
  30. data/vendor/kreuzberg/src/core/config/processing.rs +63 -0
  31. data/vendor/kreuzberg/src/core/extractor/bytes.rs +1 -1
  32. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -1
  33. data/vendor/kreuzberg/src/core/extractor/sync.rs +24 -21
  34. data/vendor/kreuzberg/src/core/formats.rs +2 -2
  35. data/vendor/kreuzberg/src/core/mime.rs +3 -3
  36. data/vendor/kreuzberg/src/core/pipeline/cache.rs +2 -2
  37. data/vendor/kreuzberg/src/doc_orientation.rs +22 -4
  38. data/vendor/kreuzberg/src/embeddings/mod.rs +250 -18
  39. data/vendor/kreuzberg/src/error.rs +6 -0
  40. data/vendor/kreuzberg/src/extraction/derive.rs +6 -1
  41. data/vendor/kreuzberg/src/extraction/docx/drawing.rs +2 -4
  42. data/vendor/kreuzberg/src/extraction/docx/mod.rs +185 -0
  43. data/vendor/kreuzberg/src/extraction/html/structure.rs +5 -7
  44. data/vendor/kreuzberg/src/extraction/image.rs +1 -0
  45. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +1 -0
  46. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -1
  47. data/vendor/kreuzberg/src/extraction/pst.rs +6 -7
  48. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +3 -0
  49. data/vendor/kreuzberg/src/extraction/transform/mod.rs +5 -0
  50. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +38 -50
  51. data/vendor/kreuzberg/src/extractors/doc.rs +4 -1
  52. data/vendor/kreuzberg/src/extractors/docbook.rs +8 -12
  53. data/vendor/kreuzberg/src/extractors/docx.rs +16 -5
  54. data/vendor/kreuzberg/src/extractors/excel.rs +5 -2
  55. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +2 -4
  56. data/vendor/kreuzberg/src/extractors/html.rs +173 -1
  57. data/vendor/kreuzberg/src/extractors/image.rs +268 -37
  58. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +4 -1
  59. data/vendor/kreuzberg/src/extractors/iwork/mod.rs +4 -8
  60. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +4 -1
  61. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +4 -1
  62. data/vendor/kreuzberg/src/extractors/markdown.rs +22 -32
  63. data/vendor/kreuzberg/src/extractors/mdx.rs +22 -32
  64. data/vendor/kreuzberg/src/extractors/mod.rs +7 -12
  65. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +4 -0
  66. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +161 -49
  67. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +148 -13
  68. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +47 -1
  69. data/vendor/kreuzberg/src/extractors/ppt.rs +3 -0
  70. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -0
  71. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +4 -5
  72. data/vendor/kreuzberg/src/keywords/mod.rs +6 -10
  73. data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs +17 -19
  74. data/vendor/kreuzberg/src/language_detection/mod.rs +6 -10
  75. data/vendor/kreuzberg/src/layout/engine.rs +9 -2
  76. data/vendor/kreuzberg/src/layout/mod.rs +17 -6
  77. data/vendor/kreuzberg/src/layout/models/rtdetr.rs +5 -2
  78. data/vendor/kreuzberg/src/layout/models/slanet.rs +5 -2
  79. data/vendor/kreuzberg/src/layout/models/table_classifier.rs +5 -2
  80. data/vendor/kreuzberg/src/layout/models/tatr.rs +5 -2
  81. data/vendor/kreuzberg/src/layout/models/yolo.rs +2 -1
  82. data/vendor/kreuzberg/src/layout/session.rs +4 -51
  83. data/vendor/kreuzberg/src/lib.rs +2 -0
  84. data/vendor/kreuzberg/src/mcp/errors.rs +18 -0
  85. data/vendor/kreuzberg/src/mcp/params.rs +19 -1
  86. data/vendor/kreuzberg/src/mcp/server.rs +14 -3
  87. data/vendor/kreuzberg/src/ocr/processor/execution.rs +8 -16
  88. data/vendor/kreuzberg/src/ort_discovery.rs +75 -1
  89. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +43 -5
  90. data/vendor/kreuzberg/src/pdf/bindings.rs +40 -15
  91. data/vendor/kreuzberg/src/pdf/error.rs +3 -0
  92. data/vendor/kreuzberg/src/pdf/fonts.rs +2 -2
  93. data/vendor/kreuzberg/src/pdf/images.rs +1 -1
  94. data/vendor/kreuzberg/src/pdf/layout_runner.rs +1 -0
  95. data/vendor/kreuzberg/src/pdf/metadata.rs +3 -3
  96. data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +2 -2
  97. data/vendor/kreuzberg/src/pdf/oxide/table.rs +10 -6
  98. data/vendor/kreuzberg/src/pdf/oxide/text.rs +1 -0
  99. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -1
  100. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1 -1
  101. data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +2 -3
  102. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +121 -18
  103. data/vendor/kreuzberg/src/pdf/text.rs +2 -1
  104. data/vendor/kreuzberg/src/plugins/registry/mod.rs +11 -11
  105. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +30 -31
  106. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  107. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +2 -27
  108. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +2 -1
  109. data/vendor/kreuzberg/src/types/extraction.rs +1 -1
  110. data/vendor/kreuzberg/src/types/internal.rs +18 -0
  111. data/vendor/kreuzberg/src/types/mod.rs +5 -0
  112. data/vendor/kreuzberg/src/types/page.rs +26 -1
  113. data/vendor/kreuzberg/src/utils/markdown_utils.rs +40 -0
  114. data/vendor/kreuzberg/src/utils/mod.rs +1 -0
  115. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +3 -3
  116. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +2 -2
  117. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +2 -2
  118. data/vendor/kreuzberg/src/utils/string_utils.rs +7 -7
  119. data/vendor/kreuzberg/tests/cross_format_parity.rs +9 -4
  120. data/vendor/kreuzberg/tests/llm_integration.rs +2 -0
  121. data/vendor/kreuzberg/tests/ocr_content_integrity.rs +154 -0
  122. data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +42 -0
  123. data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
  124. data/vendor/kreuzberg-ffi/kreuzberg.h +107 -4
  125. data/vendor/kreuzberg-ffi/src/cancellation.rs +167 -0
  126. data/vendor/kreuzberg-ffi/src/error.rs +32 -7
  127. data/vendor/kreuzberg-ffi/src/lib.rs +10 -4
  128. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  129. data/vendor/kreuzberg-paddle-ocr/src/ocr_lite.rs +21 -0
  130. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  131. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  132. data/vendor/kreuzberg-tesseract/build.rs +63 -37
  133. metadata +14 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 198e3a74bbf86420f51a7d5b8f80e5e110cc93f2e144fadfe129f8f90155fe40
4
- data.tar.gz: e54cfaf29ae9b910f5cc365a8095298e312ef93098c185dafa497b80a5f5859b
3
+ metadata.gz: 559b3104e6e21f2f14a92949d427703b51ea8b35b7a643d8964b6953785aa6e1
4
+ data.tar.gz: bfce92579c45ecba0da0d8e1f077ecca0bb9dd6a1e96c950e8beb8d0a39b5884
5
5
  SHA512:
6
- metadata.gz: e5d6e2d8a21b12d6984bd4fc3d4cdc7858b46d1d21e09e238c832da830f8329af93f33ac6997590a342b52f68c7448aa68d68b8a323397116682e1373be28d38
7
- data.tar.gz: d5a7c6df3f4a90f76e8554fffcb3cba45e274521f2efc96ecf4c882228b8398ed4ec0cb3322ccf1090b615b3ecae5b19b8f947e779fb3bf1a72c6e101d4184f1
6
+ metadata.gz: 1ea8af57d65eb5008126758041df2bfe07acca9d47ebfbf9c9de79f12b5d5ff2336d55b643268d1ea420db1825eaf9bfef6e5deb7335bd2449e9ccb62800492d
7
+ data.tar.gz: '019f2abaa7dcaf2b91925f7ed0b5332ce569a41a5ae1f152d698423cce1276ea68e177c2088b4377c369943e235c70584f61a76da5600d6dd8b3bd075bc266ab'
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -304,9 +304,9 @@ dependencies = [
304
304
 
305
305
  [[package]]
306
306
  name = "aws-lc-rs"
307
- version = "1.16.2"
307
+ version = "1.16.3"
308
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
309
- checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc"
309
+ checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f"
310
310
  dependencies = [
311
311
  "aws-lc-sys",
312
312
  "zeroize",
@@ -314,9 +314,9 @@ dependencies = [
314
314
 
315
315
  [[package]]
316
316
  name = "aws-lc-sys"
317
- version = "0.39.1"
317
+ version = "0.40.0"
318
318
  source = "registry+https://github.com/rust-lang/crates.io-index"
319
- checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
319
+ checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7"
320
320
  dependencies = [
321
321
  "cc",
322
322
  "cmake",
@@ -415,7 +415,7 @@ checksum = "53d0c374feba1b9a59042a7c1cf00ce7c34b977b9134fe7c42b08e5183729f66"
415
415
  dependencies = [
416
416
  "paste",
417
417
  "roman-numerals-rs",
418
- "strum",
418
+ "strum 0.27.2",
419
419
  "unicode-normalization",
420
420
  "unscanny",
421
421
  ]
@@ -476,17 +476,17 @@ checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6"
476
476
 
477
477
  [[package]]
478
478
  name = "bitflags"
479
- version = "2.11.0"
479
+ version = "2.11.1"
480
480
  source = "registry+https://github.com/rust-lang/crates.io-index"
481
- checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
481
+ checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
482
482
 
483
483
  [[package]]
484
484
  name = "bitstream-io"
485
- version = "4.9.0"
485
+ version = "4.10.0"
486
486
  source = "registry+https://github.com/rust-lang/crates.io-index"
487
- checksum = "60d4bd9d1db2c6bdf285e223a7fa369d5ce98ec767dec949c6ca62863ce61757"
487
+ checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f"
488
488
  dependencies = [
489
- "core2",
489
+ "no_std_io2",
490
490
  ]
491
491
 
492
492
  [[package]]
@@ -817,9 +817,9 @@ dependencies = [
817
817
 
818
818
  [[package]]
819
819
  name = "clap"
820
- version = "4.6.0"
820
+ version = "4.6.1"
821
821
  source = "registry+https://github.com/rust-lang/crates.io-index"
822
- checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
822
+ checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
823
823
  dependencies = [
824
824
  "clap_builder",
825
825
  ]
@@ -1065,15 +1065,6 @@ version = "0.8.7"
1065
1065
  source = "registry+https://github.com/rust-lang/crates.io-index"
1066
1066
  checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
1067
1067
 
1068
- [[package]]
1069
- name = "core2"
1070
- version = "0.4.0"
1071
- source = "registry+https://github.com/rust-lang/crates.io-index"
1072
- checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
1073
- dependencies = [
1074
- "memchr",
1075
- ]
1076
-
1077
1068
  [[package]]
1078
1069
  name = "core_maths"
1079
1070
  version = "0.1.1"
@@ -1162,9 +1153,9 @@ dependencies = [
1162
1153
 
1163
1154
  [[package]]
1164
1155
  name = "ctor"
1165
- version = "0.9.1"
1156
+ version = "0.10.0"
1166
1157
  source = "registry+https://github.com/rust-lang/crates.io-index"
1167
- checksum = "c1c888a2a4f677017373fb6c01e13e318dd9e78758445ed5eb985e355d3f8281"
1158
+ checksum = "95d0d11eb38e7642efca359c3cf6eb7b2e528182d09110165de70192b0352775"
1168
1159
  dependencies = [
1169
1160
  "ctor-proc-macro",
1170
1161
  "dtor",
@@ -1248,9 +1239,9 @@ dependencies = [
1248
1239
 
1249
1240
  [[package]]
1250
1241
  name = "dary_heap"
1251
- version = "0.3.8"
1242
+ version = "0.3.9"
1252
1243
  source = "registry+https://github.com/rust-lang/crates.io-index"
1253
- checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
1244
+ checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe"
1254
1245
  dependencies = [
1255
1246
  "serde",
1256
1247
  ]
@@ -1417,9 +1408,9 @@ dependencies = [
1417
1408
 
1418
1409
  [[package]]
1419
1410
  name = "dtor"
1420
- version = "0.6.0"
1411
+ version = "0.7.0"
1421
1412
  source = "registry+https://github.com/rust-lang/crates.io-index"
1422
- checksum = "30e4690622ab6700ced40fc370a3f07b7d111f0154bb6fb08f73b4c8834f75b6"
1413
+ checksum = "17f72721db8027a4e96dd6fb50d2a1d32259c9d3da1b63dee612ccd981e14293"
1423
1414
  dependencies = [
1424
1415
  "dtor-proc-macro",
1425
1416
  ]
@@ -1803,9 +1794,9 @@ dependencies = [
1803
1794
 
1804
1795
  [[package]]
1805
1796
  name = "geo-types"
1806
- version = "0.7.18"
1797
+ version = "0.7.19"
1807
1798
  source = "registry+https://github.com/rust-lang/crates.io-index"
1808
- checksum = "24f8647af4005fa11da47cd56252c6ef030be8fa97bdbf355e7dfb6348f0a82c"
1799
+ checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5"
1809
1800
  dependencies = [
1810
1801
  "approx",
1811
1802
  "num-traits",
@@ -2018,21 +2009,15 @@ dependencies = [
2018
2009
 
2019
2010
  [[package]]
2020
2011
  name = "hashbrown"
2021
- version = "0.16.1"
2012
+ version = "0.17.0"
2022
2013
  source = "registry+https://github.com/rust-lang/crates.io-index"
2023
- checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
2014
+ checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
2024
2015
  dependencies = [
2025
2016
  "allocator-api2",
2026
2017
  "equivalent",
2027
2018
  "foldhash 0.2.0",
2028
2019
  ]
2029
2020
 
2030
- [[package]]
2031
- name = "hashbrown"
2032
- version = "0.17.0"
2033
- source = "registry+https://github.com/rust-lang/crates.io-index"
2034
- checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
2035
-
2036
2021
  [[package]]
2037
2022
  name = "hashify"
2038
2023
  version = "0.2.9"
@@ -2053,9 +2038,9 @@ checksum = "9f4d0e94ddd48749f06bbe4e5389fb9799a0c45bcaf00495042076ef05e3241a"
2053
2038
 
2054
2039
  [[package]]
2055
2040
  name = "hayro-jbig2"
2056
- version = "0.2.0"
2041
+ version = "0.3.0"
2057
2042
  source = "registry+https://github.com/rust-lang/crates.io-index"
2058
- checksum = "a23692ac727653ca0450420479a8b60a31fa3c35fa37fb74bb36e0c69e88833d"
2043
+ checksum = "69374b3668dd45aeb3d3145cda68f2c7b4f223aaa2511e67d076f1c7d741388d"
2059
2044
  dependencies = [
2060
2045
  "hayro-ccitt",
2061
2046
  ]
@@ -2142,9 +2127,9 @@ dependencies = [
2142
2127
 
2143
2128
  [[package]]
2144
2129
  name = "html-to-markdown-rs"
2145
- version = "3.1.0"
2130
+ version = "3.2.5"
2146
2131
  source = "registry+https://github.com/rust-lang/crates.io-index"
2147
- checksum = "c116fad2be99a76c9d9bce6197e0e3271de05f26a967b30888ba07eb669bc666"
2132
+ checksum = "bcb619abe81160bba2e2185823e10f6c0793220a266f16791aa715287de322cd"
2148
2133
  dependencies = [
2149
2134
  "ahash",
2150
2135
  "astral-tl",
@@ -2248,9 +2233,9 @@ dependencies = [
2248
2233
 
2249
2234
  [[package]]
2250
2235
  name = "hyper-rustls"
2251
- version = "0.27.8"
2236
+ version = "0.27.9"
2252
2237
  source = "registry+https://github.com/rust-lang/crates.io-index"
2253
- checksum = "c2b52f86d1d4bc0d6b4e6826d960b1b333217e07d36b882dca570a5e1c48895b"
2238
+ checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f"
2254
2239
  dependencies = [
2255
2240
  "http",
2256
2241
  "hyper",
@@ -2783,7 +2768,7 @@ dependencies = [
2783
2768
 
2784
2769
  [[package]]
2785
2770
  name = "kreuzberg"
2786
- version = "4.8.3"
2771
+ version = "4.8.6"
2787
2772
  dependencies = [
2788
2773
  "ahash",
2789
2774
  "async-trait",
@@ -2878,7 +2863,7 @@ dependencies = [
2878
2863
 
2879
2864
  [[package]]
2880
2865
  name = "kreuzberg-ffi"
2881
- version = "4.8.3"
2866
+ version = "4.8.6"
2882
2867
  dependencies = [
2883
2868
  "ahash",
2884
2869
  "async-trait",
@@ -2894,7 +2879,7 @@ dependencies = [
2894
2879
 
2895
2880
  [[package]]
2896
2881
  name = "kreuzberg-paddle-ocr"
2897
- version = "4.8.3"
2882
+ version = "4.8.6"
2898
2883
  dependencies = [
2899
2884
  "geo-clipper",
2900
2885
  "geo-types",
@@ -2908,7 +2893,7 @@ dependencies = [
2908
2893
 
2909
2894
  [[package]]
2910
2895
  name = "kreuzberg-pdfium-render"
2911
- version = "4.8.3"
2896
+ version = "4.8.6"
2912
2897
  dependencies = [
2913
2898
  "bitflags",
2914
2899
  "bytemuck",
@@ -2931,7 +2916,7 @@ dependencies = [
2931
2916
 
2932
2917
  [[package]]
2933
2918
  name = "kreuzberg-rb"
2934
- version = "4.8.5"
2919
+ version = "4.9.1"
2935
2920
  dependencies = [
2936
2921
  "async-trait",
2937
2922
  "html-to-markdown-rs",
@@ -2948,7 +2933,7 @@ dependencies = [
2948
2933
 
2949
2934
  [[package]]
2950
2935
  name = "kreuzberg-tesseract"
2951
- version = "4.8.3"
2936
+ version = "4.8.6"
2952
2937
  dependencies = [
2953
2938
  "cc",
2954
2939
  "cmake",
@@ -2977,9 +2962,9 @@ checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
2977
2962
 
2978
2963
  [[package]]
2979
2964
  name = "libbz2-rs-sys"
2980
- version = "0.2.2"
2965
+ version = "0.2.3"
2981
2966
  source = "registry+https://github.com/rust-lang/crates.io-index"
2982
- checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
2967
+ checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f"
2983
2968
 
2984
2969
  [[package]]
2985
2970
  name = "libc"
@@ -3037,9 +3022,9 @@ dependencies = [
3037
3022
 
3038
3023
  [[package]]
3039
3024
  name = "link-section"
3040
- version = "0.0.12"
3025
+ version = "0.2.0"
3041
3026
  source = "registry+https://github.com/rust-lang/crates.io-index"
3042
- checksum = "f52437d47b0358721ec869cc7374b2a21f7b2237af9b439c0391341a1fbfbf1b"
3027
+ checksum = "468808413fa8bdf0edbe61c2bbc182dfc59885b94f496cf3fb42c9c96b1e0149"
3043
3028
 
3044
3029
  [[package]]
3045
3030
  name = "linux-raw-sys"
@@ -3055,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
3055
3040
 
3056
3041
  [[package]]
3057
3042
  name = "liter-llm"
3058
- version = "1.2.0"
3043
+ version = "1.2.1"
3059
3044
  source = "registry+https://github.com/rust-lang/crates.io-index"
3060
- checksum = "0405bbc5926db49a5f73a4f503d9cac19413416c95e2fd736b1bfa8ce9491660"
3045
+ checksum = "1884be380e45da823105c85ef0fa188af81d57be7de9b65016576e1774fdd5f8"
3061
3046
  dependencies = [
3062
3047
  "base64 0.22.1",
3063
3048
  "bytes",
@@ -3138,11 +3123,11 @@ dependencies = [
3138
3123
 
3139
3124
  [[package]]
3140
3125
  name = "lru"
3141
- version = "0.16.4"
3126
+ version = "0.17.0"
3142
3127
  source = "registry+https://github.com/rust-lang/crates.io-index"
3143
- checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
3128
+ checksum = "0e0b564323a0fb6d54b864f625ae139de9612e27edb944dda37c109f05aac531"
3144
3129
  dependencies = [
3145
- "hashbrown 0.16.1",
3130
+ "hashbrown 0.17.0",
3146
3131
  ]
3147
3132
 
3148
3133
  [[package]]
@@ -3462,6 +3447,15 @@ version = "1.0.6"
3462
3447
  source = "registry+https://github.com/rust-lang/crates.io-index"
3463
3448
  checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
3464
3449
 
3450
+ [[package]]
3451
+ name = "no_std_io2"
3452
+ version = "0.9.3"
3453
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3454
+ checksum = "b51ed7824b6e07d354605f4abb3d9d300350701299da96642ee084f5ce631550"
3455
+ dependencies = [
3456
+ "memchr",
3457
+ ]
3458
+
3465
3459
  [[package]]
3466
3460
  name = "nom"
3467
3461
  version = "7.1.3"
@@ -3900,9 +3894,9 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
3900
3894
 
3901
3895
  [[package]]
3902
3896
  name = "portable-atomic-util"
3903
- version = "0.2.6"
3897
+ version = "0.2.7"
3904
3898
  source = "registry+https://github.com/rust-lang/crates.io-index"
3905
- checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
3899
+ checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618"
3906
3900
  dependencies = [
3907
3901
  "portable-atomic",
3908
3902
  ]
@@ -4014,9 +4008,9 @@ checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
4014
4008
 
4015
4009
  [[package]]
4016
4010
  name = "pxfm"
4017
- version = "0.1.28"
4011
+ version = "0.1.29"
4018
4012
  source = "registry+https://github.com/rust-lang/crates.io-index"
4019
- checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d"
4013
+ checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
4020
4014
 
4021
4015
  [[package]]
4022
4016
  name = "quick-error"
@@ -4463,9 +4457,9 @@ dependencies = [
4463
4457
 
4464
4458
  [[package]]
4465
4459
  name = "rmcp"
4466
- version = "1.4.0"
4460
+ version = "1.5.0"
4467
4461
  source = "registry+https://github.com/rust-lang/crates.io-index"
4468
- checksum = "f542f74cf247da16f19bbc87e298cd201e912314f4083e88cdd671f44f5fcb53"
4462
+ checksum = "67d69668de0b0ccd9cc435f700f3b39a7861863cf37a15e1f304ea78688a4826"
4469
4463
  dependencies = [
4470
4464
  "async-trait",
4471
4465
  "base64 0.22.1",
@@ -4494,9 +4488,9 @@ dependencies = [
4494
4488
 
4495
4489
  [[package]]
4496
4490
  name = "rmcp-macros"
4497
- version = "1.4.0"
4491
+ version = "1.5.0"
4498
4492
  source = "registry+https://github.com/rust-lang/crates.io-index"
4499
- checksum = "b2391e4ae47f314e70eaafb6c7bd82e495e770b935448864446302143019151f"
4493
+ checksum = "48fdc01c81097b0aed18633e676e269fefa3a78ec1df56b4fe597c1241b92025"
4500
4494
  dependencies = [
4501
4495
  "darling 0.23.0",
4502
4496
  "proc-macro2",
@@ -5117,7 +5111,16 @@ version = "0.27.2"
5117
5111
  source = "registry+https://github.com/rust-lang/crates.io-index"
5118
5112
  checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
5119
5113
  dependencies = [
5120
- "strum_macros",
5114
+ "strum_macros 0.27.2",
5115
+ ]
5116
+
5117
+ [[package]]
5118
+ name = "strum"
5119
+ version = "0.28.0"
5120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5121
+ checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
5122
+ dependencies = [
5123
+ "strum_macros 0.28.0",
5121
5124
  ]
5122
5125
 
5123
5126
  [[package]]
@@ -5132,6 +5135,18 @@ dependencies = [
5132
5135
  "syn",
5133
5136
  ]
5134
5137
 
5138
+ [[package]]
5139
+ name = "strum_macros"
5140
+ version = "0.28.0"
5141
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5142
+ checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
5143
+ dependencies = [
5144
+ "heck",
5145
+ "proc-macro2",
5146
+ "quote",
5147
+ "syn",
5148
+ ]
5149
+
5135
5150
  [[package]]
5136
5151
  name = "subtle"
5137
5152
  version = "2.6.1"
@@ -5211,9 +5226,9 @@ dependencies = [
5211
5226
 
5212
5227
  [[package]]
5213
5228
  name = "text-splitter"
5214
- version = "0.29.3"
5229
+ version = "0.30.1"
5215
5230
  source = "registry+https://github.com/rust-lang/crates.io-index"
5216
- checksum = "2979ebb41243f6c8adc1c1adb76f35fe4e59ba2bc07f7863bb56bcc838798bf4"
5231
+ checksum = "f1c090dcb5a7e4da833fcd8bdaf7fd5a9596c8fe9fe5c5355960243eaa4b5716"
5217
5232
  dependencies = [
5218
5233
  "ahash",
5219
5234
  "auto_enums",
@@ -5223,7 +5238,7 @@ dependencies = [
5223
5238
  "itertools 0.14.0",
5224
5239
  "memchr",
5225
5240
  "pulldown-cmark",
5226
- "strum",
5241
+ "strum 0.28.0",
5227
5242
  "thiserror 2.0.18",
5228
5243
  "tokenizers",
5229
5244
  ]
@@ -5394,9 +5409,9 @@ dependencies = [
5394
5409
 
5395
5410
  [[package]]
5396
5411
  name = "tokio"
5397
- version = "1.51.1"
5412
+ version = "1.52.1"
5398
5413
  source = "registry+https://github.com/rust-lang/crates.io-index"
5399
- checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c"
5414
+ checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6"
5400
5415
  dependencies = [
5401
5416
  "bytes",
5402
5417
  "libc",
@@ -5674,9 +5689,9 @@ checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
5674
5689
 
5675
5690
  [[package]]
5676
5691
  name = "tree-sitter-language-pack"
5677
- version = "1.4.2"
5692
+ version = "1.6.2"
5678
5693
  source = "registry+https://github.com/rust-lang/crates.io-index"
5679
- checksum = "6ed9cacce88ea8b3a92813649012c1ef387bc864f1dd8398843e8b9e076233b0"
5694
+ checksum = "104c982ba77c77942e81029458eb44524b62cc6f4ddb62ce1397420a08ff3c3b"
5680
5695
  dependencies = [
5681
5696
  "ahash",
5682
5697
  "cc",
@@ -5719,9 +5734,9 @@ checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
5719
5734
 
5720
5735
  [[package]]
5721
5736
  name = "typenum"
5722
- version = "1.19.0"
5737
+ version = "1.20.0"
5723
5738
  source = "registry+https://github.com/rust-lang/crates.io-index"
5724
- checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
5739
+ checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
5725
5740
 
5726
5741
  [[package]]
5727
5742
  name = "unicase"
@@ -5853,7 +5868,7 @@ dependencies = [
5853
5868
  "ureq-proto",
5854
5869
  "utf8-zero",
5855
5870
  "webpki-root-certs",
5856
- "webpki-roots 1.0.6",
5871
+ "webpki-roots 1.0.7",
5857
5872
  ]
5858
5873
 
5859
5874
  [[package]]
@@ -5951,9 +5966,9 @@ dependencies = [
5951
5966
 
5952
5967
  [[package]]
5953
5968
  name = "uuid"
5954
- version = "1.23.0"
5969
+ version = "1.23.1"
5955
5970
  source = "registry+https://github.com/rust-lang/crates.io-index"
5956
- checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
5971
+ checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
5957
5972
  dependencies = [
5958
5973
  "getrandom 0.4.2",
5959
5974
  "js-sys",
@@ -6031,11 +6046,11 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
6031
6046
 
6032
6047
  [[package]]
6033
6048
  name = "wasip2"
6034
- version = "1.0.2+wasi-0.2.9"
6049
+ version = "1.0.3+wasi-0.2.9"
6035
6050
  source = "registry+https://github.com/rust-lang/crates.io-index"
6036
- checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
6051
+ checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
6037
6052
  dependencies = [
6038
- "wit-bindgen",
6053
+ "wit-bindgen 0.57.1",
6039
6054
  ]
6040
6055
 
6041
6056
  [[package]]
@@ -6044,7 +6059,7 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
6044
6059
  source = "registry+https://github.com/rust-lang/crates.io-index"
6045
6060
  checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
6046
6061
  dependencies = [
6047
- "wit-bindgen",
6062
+ "wit-bindgen 0.51.0",
6048
6063
  ]
6049
6064
 
6050
6065
  [[package]]
@@ -6171,9 +6186,9 @@ dependencies = [
6171
6186
 
6172
6187
  [[package]]
6173
6188
  name = "web_atoms"
6174
- version = "0.2.3"
6189
+ version = "0.2.4"
6175
6190
  source = "registry+https://github.com/rust-lang/crates.io-index"
6176
- checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576"
6191
+ checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538"
6177
6192
  dependencies = [
6178
6193
  "phf",
6179
6194
  "phf_codegen",
@@ -6183,9 +6198,9 @@ dependencies = [
6183
6198
 
6184
6199
  [[package]]
6185
6200
  name = "webpki-root-certs"
6186
- version = "1.0.6"
6201
+ version = "1.0.7"
6187
6202
  source = "registry+https://github.com/rust-lang/crates.io-index"
6188
- checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
6203
+ checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
6189
6204
  dependencies = [
6190
6205
  "rustls-pki-types",
6191
6206
  ]
@@ -6196,14 +6211,14 @@ version = "0.26.11"
6196
6211
  source = "registry+https://github.com/rust-lang/crates.io-index"
6197
6212
  checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
6198
6213
  dependencies = [
6199
- "webpki-roots 1.0.6",
6214
+ "webpki-roots 1.0.7",
6200
6215
  ]
6201
6216
 
6202
6217
  [[package]]
6203
6218
  name = "webpki-roots"
6204
- version = "1.0.6"
6219
+ version = "1.0.7"
6205
6220
  source = "registry+https://github.com/rust-lang/crates.io-index"
6206
- checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
6221
+ checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
6207
6222
  dependencies = [
6208
6223
  "rustls-pki-types",
6209
6224
  ]
@@ -6575,6 +6590,12 @@ dependencies = [
6575
6590
  "wit-bindgen-rust-macro",
6576
6591
  ]
6577
6592
 
6593
+ [[package]]
6594
+ name = "wit-bindgen"
6595
+ version = "0.57.1"
6596
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6597
+ checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
6598
+
6578
6599
  [[package]]
6579
6600
  name = "wit-bindgen-core"
6580
6601
  version = "0.51.0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.5"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -55,7 +55,7 @@ rb-sys = { version = "0.9.126", default-features = false, features = [
55
55
  serde_json = "1.0.149"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
- tokio = { version = "1.51.1", features = [
58
+ tokio = { version = "1.52.1", features = [
59
59
  "rt",
60
60
  "rt-multi-thread",
61
61
  "macros",
@@ -65,7 +65,7 @@ tokio = { version = "1.51.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.1.0", default-features = false }
68
+ html-to-markdown-rs = { version = "3.2.5", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -42,6 +42,7 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
42
42
  };
43
43
 
44
44
  let mut config = OcrConfig {
45
+ enabled: true,
45
46
  backend,
46
47
  language,
47
48
  paddle_ocr_config: None,
@@ -136,6 +137,8 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
136
137
  {
137
138
  match symbol_to_string(val)?.as_str() {
138
139
  "markdown" => kreuzberg::ChunkerType::Markdown,
140
+ "yaml" => kreuzberg::ChunkerType::Yaml,
141
+ "semantic" => kreuzberg::ChunkerType::Semantic,
139
142
  _ => kreuzberg::ChunkerType::Text,
140
143
  }
141
144
  } else {
@@ -150,6 +153,14 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
150
153
  false
151
154
  };
152
155
 
156
+ let topic_threshold = if let Some(val) = get_kw(ruby, hash, "topic_threshold")
157
+ && val.equal(ruby.qnil()).ok() != Some(true)
158
+ {
159
+ Some(f64::try_convert(val)? as f32)
160
+ } else {
161
+ None
162
+ };
163
+
153
164
  let config = ChunkingConfig {
154
165
  max_characters: max_chars,
155
166
  overlap: max_overlap,
@@ -159,6 +170,7 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
159
170
  preset,
160
171
  sizing,
161
172
  prepend_heading_context,
173
+ topic_threshold,
162
174
  };
163
175
 
164
176
  Ok(config)
@@ -840,10 +852,20 @@ pub fn parse_layout_detection_config(ruby: &Ruby, hash: RHash) -> Result<LayoutD
840
852
  kreuzberg::core::config::layout::TableModel::default()
841
853
  };
842
854
 
855
+ let acceleration = if let Some(val) = get_kw(ruby, hash, "acceleration")
856
+ && val.equal(ruby.qnil()).ok() != Some(true)
857
+ {
858
+ let accel_hash = RHash::try_convert(val)?;
859
+ Some(parse_acceleration_config(ruby, accel_hash)?)
860
+ } else {
861
+ None
862
+ };
863
+
843
864
  let config = LayoutDetectionConfig {
844
865
  confidence_threshold,
845
866
  apply_heuristics,
846
867
  table_model,
868
+ acceleration,
847
869
  };
848
870
 
849
871
  Ok(config)
@@ -108,6 +108,13 @@ pub fn kreuzberg_error(err: KreuzbergError) -> Error {
108
108
  Error::new(ruby.exception_runtime_error(), format!("EmbeddingError: {}", message))
109
109
  }
110
110
  }
111
+ KreuzbergError::Cancelled => {
112
+ if let Some(class) = fetch_error_class("CancelledError") {
113
+ Error::new(class, "Extraction cancelled")
114
+ } else {
115
+ Error::new(ruby.exception_runtime_error(), "Extraction cancelled")
116
+ }
117
+ }
111
118
  other => Error::new(ruby.exception_runtime_error(), other.to_string()),
112
119
  }
113
120
  }
@@ -751,6 +751,13 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
751
751
  set_hash_entry(ruby, &hash, "llm_usage", ruby.qnil().as_value())?;
752
752
  }
753
753
 
754
+ // Convert structured output (Value::Null maps to qnil via json_value_to_ruby)
755
+ let structured_ruby = match &result.structured_output {
756
+ Some(val) => json_value_to_ruby(ruby, val)?,
757
+ None => ruby.qnil().as_value(),
758
+ };
759
+ set_hash_entry(ruby, &hash, "structured_output", structured_ruby)?;
760
+
754
761
  // Convert annotations
755
762
  if let Some(annotations) = result.annotations {
756
763
  let annotations_array = ruby.ary_new();