kreuzberg 4.5.1 → 4.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +4 -4
  4. data/ext/kreuzberg_rb/native/Cargo.lock +90 -47
  5. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -9
  7. data/lib/kreuzberg/config.rb +25 -8
  8. data/lib/kreuzberg/version.rb +1 -1
  9. data/sig/kreuzberg.rbs +7 -2
  10. data/vendor/Cargo.toml +6 -5
  11. data/vendor/kreuzberg/Cargo.toml +144 -111
  12. data/vendor/kreuzberg/README.md +1 -1
  13. data/vendor/kreuzberg/src/api/handlers.rs +483 -2
  14. data/vendor/kreuzberg/src/api/mod.rs +7 -2
  15. data/vendor/kreuzberg/src/api/openapi.rs +19 -0
  16. data/vendor/kreuzberg/src/api/router.rs +7 -3
  17. data/vendor/kreuzberg/src/api/types.rs +75 -0
  18. data/vendor/kreuzberg/src/cache/core.rs +223 -122
  19. data/vendor/kreuzberg/src/cache/mod.rs +20 -16
  20. data/vendor/kreuzberg/src/cache/utilities.rs +62 -44
  21. data/vendor/kreuzberg/src/chunking/core.rs +47 -0
  22. data/vendor/kreuzberg/src/core/config/extraction/core.rs +18 -0
  23. data/vendor/kreuzberg/src/core/config/layout.rs +12 -0
  24. data/vendor/kreuzberg/src/core/config/processing.rs +19 -0
  25. data/vendor/kreuzberg/src/core/extractor/file.rs +79 -0
  26. data/vendor/kreuzberg/src/core/mime.rs +42 -1
  27. data/vendor/kreuzberg/src/extraction/hwp/error.rs +54 -0
  28. data/vendor/kreuzberg/src/extraction/hwp/mod.rs +72 -0
  29. data/vendor/kreuzberg/src/extraction/hwp/model.rs +102 -0
  30. data/vendor/kreuzberg/src/extraction/hwp/parser.rs +174 -0
  31. data/vendor/kreuzberg/src/extraction/hwp/reader.rs +126 -0
  32. data/vendor/kreuzberg/src/extraction/mod.rs +3 -0
  33. data/vendor/kreuzberg/src/extractors/epub/content.rs +58 -7
  34. data/vendor/kreuzberg/src/extractors/epub/mod.rs +19 -5
  35. data/vendor/kreuzberg/src/extractors/hwp.rs +4 -5
  36. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +189 -0
  37. data/vendor/kreuzberg/src/extractors/iwork/mod.rs +291 -0
  38. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +186 -0
  39. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +182 -0
  40. data/vendor/kreuzberg/src/extractors/mod.rs +13 -0
  41. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +18 -3
  42. data/vendor/kreuzberg/src/layout/engine.rs +3 -0
  43. data/vendor/kreuzberg/src/layout/mod.rs +133 -0
  44. data/vendor/kreuzberg/src/layout/model_manager.rs +61 -2
  45. data/vendor/kreuzberg/src/layout/models/mod.rs +2 -0
  46. data/vendor/kreuzberg/src/layout/models/slanet.rs +550 -0
  47. data/vendor/kreuzberg/src/layout/models/table_classifier.rs +219 -0
  48. data/vendor/kreuzberg/src/mcp/mod.rs +9 -1
  49. data/vendor/kreuzberg/src/mcp/params.rs +87 -0
  50. data/vendor/kreuzberg/src/mcp/server.rs +585 -5
  51. data/vendor/kreuzberg/src/ocr/cache.rs +1 -1
  52. data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
  53. data/vendor/kreuzberg/src/ocr/processor/config.rs +21 -23
  54. data/vendor/kreuzberg/src/ocr/processor/execution.rs +6 -25
  55. data/vendor/kreuzberg/src/ocr/processor/validation.rs +29 -9
  56. data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +254 -0
  57. data/vendor/kreuzberg/src/ocr/utils.rs +6 -10
  58. data/vendor/kreuzberg/src/pdf/images.rs +13 -0
  59. data/vendor/kreuzberg/src/pdf/layout_runner.rs +11 -0
  60. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +9 -1
  61. data/vendor/kreuzberg/src/pdf/markdown/classify.rs +98 -6
  62. data/vendor/kreuzberg/src/pdf/markdown/mod.rs +1 -1
  63. data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +273 -51
  64. data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +2 -0
  65. data/vendor/kreuzberg/src/pdf/markdown/regions/table_recognition.rs +334 -1
  66. data/vendor/kreuzberg/src/pdf/markdown/regions/tables.rs +11 -1
  67. data/vendor/kreuzberg/src/pdf/markdown/render.rs +22 -16
  68. data/vendor/kreuzberg/src/pdf/markdown/text_repair.rs +209 -47
  69. data/vendor/kreuzberg/src/pdf/oxide_text.rs +10 -1
  70. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  71. data/vendor/kreuzberg/src/pdf/text_data.rs +15 -6
  72. data/vendor/kreuzberg/tests/epub_markdown_headings_tests.rs +177 -0
  73. data/vendor/kreuzberg/tests/instrumentation_test.rs +2 -2
  74. data/vendor/kreuzberg/tests/iwork_integration.rs +220 -0
  75. data/vendor/kreuzberg-ffi/Cargo.toml +14 -14
  76. data/vendor/kreuzberg-ffi/kreuzberg.h +46 -2
  77. data/vendor/kreuzberg-ffi/src/config_builder.rs +81 -0
  78. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +14 -14
  79. data/vendor/kreuzberg-pdfium-render/Cargo.toml +17 -17
  80. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/segment.rs +13 -0
  81. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text.rs +148 -0
  82. data/vendor/kreuzberg-tesseract/Cargo.toml +27 -27
  83. data/vendor/kreuzberg-tesseract/build.rs +61 -0
  84. metadata +16 -6
  85. data/vendor/kreuzberg/src/mcp/tools/cache.rs +0 -179
  86. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +0 -431
  87. data/vendor/kreuzberg/src/mcp/tools/mime.rs +0 -150
  88. data/vendor/kreuzberg/src/mcp/tools/mod.rs +0 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9d245ca9cadfb5b07cab9c8709854cfd0eb488684b6c5fbc6866b891f162b0d0
4
- data.tar.gz: 31ab57a13cef6881bc58c52058b1ade5ca13af00ed5d81df1fc79853ca816e8d
3
+ metadata.gz: 47a14cc623891453596552fd893b43f18ffa04068de61b47b34b7f18ad8af890
4
+ data.tar.gz: 5e18a52f5acbabba2ee64790b3831c89301e58043c8b22ba7616791a2338401e
5
5
  SHA512:
6
- metadata.gz: 5cada46dd61ecb89dd9a7ffdeeb5df86ca338b5864785d80753e12f0967d23e7183360dde623732c8579e9ba78a9b8ff26bdc978ee744a842a75c33bb3877784
7
- data.tar.gz: 12b1b780c4065379cb7d0fde912bebfac9ae5ea48a33600e4331d705c65d58a05393fa42902e0d1d7b0d5dd2ba98b1dd231fc29a9819a60cb1a8f398146e1b9d
6
+ metadata.gz: 9ac2251c79bcdff41d8746ea3244bf3774cd022aa0f9c63a0153a7ec394f86d1bfd4fe3b364d175371f602006aafea17f612250b2b8368e213989b910f17940e
7
+ data.tar.gz: 8494619ec2253eaeb68b95fc4e204355ab2a08f71f5863f88b8305c7c07333a456f73c497804de7ca7693b1207bd0afaef3e6ca0ab5cb811eee1686823bff31d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.5.1)
4
+ kreuzberg (4.5.4)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
224
224
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
225
- kreuzberg (4.5.1)
225
+ kreuzberg (4.5.4)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.4" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -61,7 +61,7 @@
61
61
  </div>
62
62
 
63
63
 
64
- Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
64
+ Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
65
65
 
66
66
 
67
67
  ## Installation
@@ -211,9 +211,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
211
211
 
212
212
  ## Features
213
213
 
214
- ### Supported File Formats (88+)
214
+ ### Supported File Formats (91+)
215
215
 
216
- 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
216
+ 91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
217
217
 
218
218
  #### Office Documents
219
219
 
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -161,6 +161,12 @@ dependencies = [
161
161
  "syn",
162
162
  ]
163
163
 
164
+ [[package]]
165
+ name = "arrayref"
166
+ version = "0.3.9"
167
+ source = "registry+https://github.com/rust-lang/crates.io-index"
168
+ checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
169
+
164
170
  [[package]]
165
171
  name = "arrayvec"
166
172
  version = "0.7.6"
@@ -473,6 +479,20 @@ dependencies = [
473
479
  "wyz",
474
480
  ]
475
481
 
482
+ [[package]]
483
+ name = "blake3"
484
+ version = "1.8.3"
485
+ source = "registry+https://github.com/rust-lang/crates.io-index"
486
+ checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
487
+ dependencies = [
488
+ "arrayref",
489
+ "arrayvec",
490
+ "cc",
491
+ "cfg-if",
492
+ "constant_time_eq 0.4.2",
493
+ "cpufeatures 0.2.17",
494
+ ]
495
+
476
496
  [[package]]
477
497
  name = "block-buffer"
478
498
  version = "0.10.4"
@@ -916,6 +936,12 @@ version = "0.3.1"
916
936
  source = "registry+https://github.com/rust-lang/crates.io-index"
917
937
  checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
918
938
 
939
+ [[package]]
940
+ name = "constant_time_eq"
941
+ version = "0.4.2"
942
+ source = "registry+https://github.com/rust-lang/crates.io-index"
943
+ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
944
+
919
945
  [[package]]
920
946
  name = "cookie"
921
947
  version = "0.18.1"
@@ -1189,15 +1215,15 @@ checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
1189
1215
 
1190
1216
  [[package]]
1191
1217
  name = "deflate64"
1192
- version = "0.1.11"
1218
+ version = "0.1.12"
1193
1219
  source = "registry+https://github.com/rust-lang/crates.io-index"
1194
- checksum = "807800ff3288b621186fe0a8f3392c4652068257302709c24efd918c3dffcdc2"
1220
+ checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
1195
1221
 
1196
1222
  [[package]]
1197
1223
  name = "der"
1198
- version = "0.7.10"
1224
+ version = "0.8.0"
1199
1225
  source = "registry+https://github.com/rust-lang/crates.io-index"
1200
- checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
1226
+ checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
1201
1227
  dependencies = [
1202
1228
  "pem-rfc7468",
1203
1229
  "zeroize",
@@ -2001,7 +2027,7 @@ dependencies = [
2001
2027
  "serde",
2002
2028
  "serde_json",
2003
2029
  "thiserror 2.0.18",
2004
- "ureq 3.2.1",
2030
+ "ureq 3.3.0",
2005
2031
  "windows-sys 0.61.2",
2006
2032
  ]
2007
2033
 
@@ -2031,9 +2057,9 @@ dependencies = [
2031
2057
 
2032
2058
  [[package]]
2033
2059
  name = "html-to-markdown-rs"
2034
- version = "2.28.6"
2060
+ version = "2.29.0"
2035
2061
  source = "registry+https://github.com/rust-lang/crates.io-index"
2036
- checksum = "6869b5e058b5ebb8c176269406b692d0695b4b19c36e532b56a2c355590978ae"
2062
+ checksum = "9013679b8c3600142e5a8f742748c3c38c49d9fc50675dad62f8f1721090a85a"
2037
2063
  dependencies = [
2038
2064
  "ahash",
2039
2065
  "astral-tl",
@@ -2643,9 +2669,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
2643
2669
 
2644
2670
  [[package]]
2645
2671
  name = "iri-string"
2646
- version = "0.7.10"
2672
+ version = "0.7.11"
2647
2673
  source = "registry+https://github.com/rust-lang/crates.io-index"
2648
- checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
2674
+ checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
2649
2675
  dependencies = [
2650
2676
  "memchr",
2651
2677
  "serde",
@@ -2731,7 +2757,7 @@ dependencies = [
2731
2757
  "cesu8",
2732
2758
  "cfg-if",
2733
2759
  "combine",
2734
- "jni-sys",
2760
+ "jni-sys 0.3.1",
2735
2761
  "log",
2736
2762
  "thiserror 1.0.69",
2737
2763
  "walkdir",
@@ -2740,9 +2766,31 @@ dependencies = [
2740
2766
 
2741
2767
  [[package]]
2742
2768
  name = "jni-sys"
2743
- version = "0.3.0"
2769
+ version = "0.3.1"
2770
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2771
+ checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
2772
+ dependencies = [
2773
+ "jni-sys 0.4.1",
2774
+ ]
2775
+
2776
+ [[package]]
2777
+ name = "jni-sys"
2778
+ version = "0.4.1"
2779
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2780
+ checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
2781
+ dependencies = [
2782
+ "jni-sys-macros",
2783
+ ]
2784
+
2785
+ [[package]]
2786
+ name = "jni-sys-macros"
2787
+ version = "0.4.1"
2744
2788
  source = "registry+https://github.com/rust-lang/crates.io-index"
2745
- checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
2789
+ checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
2790
+ dependencies = [
2791
+ "quote",
2792
+ "syn",
2793
+ ]
2746
2794
 
2747
2795
  [[package]]
2748
2796
  name = "jobserver"
@@ -2781,7 +2829,7 @@ dependencies = [
2781
2829
 
2782
2830
  [[package]]
2783
2831
  name = "kreuzberg"
2784
- version = "4.5.1"
2832
+ version = "4.5.3"
2785
2833
  dependencies = [
2786
2834
  "ahash",
2787
2835
  "async-trait",
@@ -2790,6 +2838,7 @@ dependencies = [
2790
2838
  "biblatex",
2791
2839
  "biblib",
2792
2840
  "bitvec",
2841
+ "blake3",
2793
2842
  "bytes",
2794
2843
  "calamine",
2795
2844
  "cfb 0.14.0",
@@ -2843,6 +2892,7 @@ dependencies = [
2843
2892
  "serde_yaml_ng",
2844
2893
  "sevenz-rust2",
2845
2894
  "sha2",
2895
+ "snap",
2846
2896
  "tar",
2847
2897
  "text-splitter",
2848
2898
  "thiserror 2.0.18",
@@ -2855,16 +2905,16 @@ dependencies = [
2855
2905
  "tracing",
2856
2906
  "tracing-opentelemetry",
2857
2907
  "unicode-normalization",
2858
- "ureq 3.2.1",
2908
+ "ureq 3.3.0",
2859
2909
  "utoipa",
2860
2910
  "whatlang",
2861
2911
  "yake-rust",
2862
- "zip 8.3.0",
2912
+ "zip 7.2.0",
2863
2913
  ]
2864
2914
 
2865
2915
  [[package]]
2866
2916
  name = "kreuzberg-ffi"
2867
- version = "4.5.1"
2917
+ version = "4.5.3"
2868
2918
  dependencies = [
2869
2919
  "ahash",
2870
2920
  "async-trait",
@@ -2880,7 +2930,7 @@ dependencies = [
2880
2930
 
2881
2931
  [[package]]
2882
2932
  name = "kreuzberg-paddle-ocr"
2883
- version = "4.5.1"
2933
+ version = "4.5.3"
2884
2934
  dependencies = [
2885
2935
  "geo-clipper",
2886
2936
  "geo-types",
@@ -2894,7 +2944,7 @@ dependencies = [
2894
2944
 
2895
2945
  [[package]]
2896
2946
  name = "kreuzberg-pdfium-render"
2897
- version = "4.5.1"
2947
+ version = "4.5.3"
2898
2948
  dependencies = [
2899
2949
  "bitflags",
2900
2950
  "bytemuck",
@@ -2917,7 +2967,7 @@ dependencies = [
2917
2967
 
2918
2968
  [[package]]
2919
2969
  name = "kreuzberg-rb"
2920
- version = "4.5.1"
2970
+ version = "4.5.3"
2921
2971
  dependencies = [
2922
2972
  "async-trait",
2923
2973
  "html-to-markdown-rs",
@@ -2934,13 +2984,13 @@ dependencies = [
2934
2984
 
2935
2985
  [[package]]
2936
2986
  name = "kreuzberg-tesseract"
2937
- version = "4.5.1"
2987
+ version = "4.5.3"
2938
2988
  dependencies = [
2939
2989
  "cc",
2940
2990
  "cmake",
2941
2991
  "reqwest",
2942
2992
  "thiserror 2.0.18",
2943
- "zip 8.3.0",
2993
+ "zip 7.2.0",
2944
2994
  ]
2945
2995
 
2946
2996
  [[package]]
@@ -3712,7 +3762,7 @@ dependencies = [
3712
3762
  "ort-sys",
3713
3763
  "smallvec",
3714
3764
  "tracing",
3715
- "ureq 3.2.1",
3765
+ "ureq 3.3.0",
3716
3766
  ]
3717
3767
 
3718
3768
  [[package]]
@@ -3723,7 +3773,7 @@ checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
3723
3773
  dependencies = [
3724
3774
  "hmac-sha256",
3725
3775
  "lzma-rust2 0.15.7",
3726
- "ureq 3.2.1",
3776
+ "ureq 3.3.0",
3727
3777
  ]
3728
3778
 
3729
3779
  [[package]]
@@ -3779,9 +3829,9 @@ dependencies = [
3779
3829
 
3780
3830
  [[package]]
3781
3831
  name = "pem-rfc7468"
3782
- version = "0.7.0"
3832
+ version = "1.0.0"
3783
3833
  source = "registry+https://github.com/rust-lang/crates.io-index"
3784
- checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
3834
+ checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9"
3785
3835
  dependencies = [
3786
3836
  "base64ct",
3787
3837
  ]
@@ -3977,9 +4027,9 @@ dependencies = [
3977
4027
 
3978
4028
  [[package]]
3979
4029
  name = "pulldown-cmark"
3980
- version = "0.13.2"
4030
+ version = "0.13.3"
3981
4031
  source = "registry+https://github.com/rust-lang/crates.io-index"
3982
- checksum = "14104c5a24d9bcf7eb2c24753e0f49fe14555d8bd565ea3d38e4b4303267259d"
4032
+ checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
3983
4033
  dependencies = [
3984
4034
  "bitflags",
3985
4035
  "getopts",
@@ -4952,6 +5002,12 @@ version = "1.15.1"
4952
5002
  source = "registry+https://github.com/rust-lang/crates.io-index"
4953
5003
  checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
4954
5004
 
5005
+ [[package]]
5006
+ name = "snap"
5007
+ version = "1.1.1"
5008
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5009
+ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
5010
+
4955
5011
  [[package]]
4956
5012
  name = "socket2"
4957
5013
  version = "0.6.3"
@@ -5745,9 +5801,9 @@ dependencies = [
5745
5801
 
5746
5802
  [[package]]
5747
5803
  name = "ureq"
5748
- version = "3.2.1"
5804
+ version = "3.3.0"
5749
5805
  source = "registry+https://github.com/rust-lang/crates.io-index"
5750
- checksum = "4ab5172ab0c2b6d01a9bb4f9332f7c1211193ea002742188040d09ea4eafe867"
5806
+ checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
5751
5807
  dependencies = [
5752
5808
  "base64 0.22.1",
5753
5809
  "cookie_store",
@@ -5769,9 +5825,9 @@ dependencies = [
5769
5825
 
5770
5826
  [[package]]
5771
5827
  name = "ureq-proto"
5772
- version = "0.5.3"
5828
+ version = "0.6.0"
5773
5829
  source = "registry+https://github.com/rust-lang/crates.io-index"
5774
- checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
5830
+ checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
5775
5831
  dependencies = [
5776
5832
  "base64 0.22.1",
5777
5833
  "http",
@@ -6784,7 +6840,7 @@ dependencies = [
6784
6840
  "aes",
6785
6841
  "arbitrary",
6786
6842
  "bzip2 0.5.2",
6787
- "constant_time_eq",
6843
+ "constant_time_eq 0.3.1",
6788
6844
  "crc32fast",
6789
6845
  "crossbeam-utils",
6790
6846
  "deflate64",
@@ -6819,19 +6875,6 @@ dependencies = [
6819
6875
  "zopfli",
6820
6876
  ]
6821
6877
 
6822
- [[package]]
6823
- name = "zip"
6824
- version = "8.3.0"
6825
- source = "registry+https://github.com/rust-lang/crates.io-index"
6826
- checksum = "4a243cfad17427fc077f529da5a95abe4e94fd2bfdb601611870a6557cc67657"
6827
- dependencies = [
6828
- "crc32fast",
6829
- "flate2",
6830
- "indexmap",
6831
- "memchr",
6832
- "typed-path",
6833
- ]
6834
-
6835
6878
  [[package]]
6836
6879
  name = "zlib-rs"
6837
6880
  version = "0.6.3"
@@ -6901,9 +6944,9 @@ dependencies = [
6901
6944
 
6902
6945
  [[package]]
6903
6946
  name = "zune-jpeg"
6904
- version = "0.5.13"
6947
+ version = "0.5.14"
6905
6948
  source = "registry+https://github.com/rust-lang/crates.io-index"
6906
- checksum = "ec5f41c76397b7da451efd19915684f727d7e1d516384ca6bd0ec43ec94de23c"
6949
+ checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6"
6907
6950
  dependencies = [
6908
6951
  "zune-core",
6909
6952
  ]
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.5.1"
3
+ version = "4.5.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -64,7 +64,7 @@ tokio = { version = "1.50.0", features = [
64
64
  "time",
65
65
  "io-util",
66
66
  ] }
67
- html-to-markdown-rs = { version = "2.28.6", default-features = false }
67
+ html-to-markdown-rs = { version = "2.29.0", default-features = false }
68
68
 
69
69
  [dev-dependencies]
70
70
  pretty_assertions = "1.4"
data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED
@@ -139,6 +139,12 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
139
139
 
140
140
  let sizing = parse_chunk_sizing(ruby, hash)?;
141
141
 
142
+ let prepend_heading_context = if let Some(val) = get_kw(ruby, hash, "prepend_heading_context") {
143
+ bool::try_convert(val)?
144
+ } else {
145
+ false
146
+ };
147
+
142
148
  let config = ChunkingConfig {
143
149
  max_characters: max_chars,
144
150
  overlap: max_overlap,
@@ -147,6 +153,7 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
147
153
  embedding,
148
154
  preset,
149
155
  sizing,
156
+ prepend_heading_context,
150
157
  };
151
158
 
152
159
  Ok(config)
@@ -315,12 +322,11 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
315
322
  None
316
323
  };
317
324
 
318
- let allow_single_column_tables =
319
- if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
320
- bool::try_convert(val)?
321
- } else {
322
- false
323
- };
325
+ let allow_single_column_tables = if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
326
+ bool::try_convert(val)?
327
+ } else {
328
+ false
329
+ };
324
330
 
325
331
  let config = PdfConfig {
326
332
  extract_images,
@@ -819,10 +825,19 @@ pub fn parse_layout_detection_config(ruby: &Ruby, hash: RHash) -> Result<LayoutD
819
825
  true
820
826
  };
821
827
 
828
+ let table_model = if let Some(val) = get_kw(ruby, hash, "table_model")
829
+ && val.equal(ruby.qnil()).ok() != Some(true)
830
+ {
831
+ Some(String::try_convert(val)?)
832
+ } else {
833
+ None
834
+ };
835
+
822
836
  let config = LayoutDetectionConfig {
823
837
  preset,
824
838
  confidence_threshold,
825
839
  apply_heuristics,
840
+ table_model,
826
841
  };
827
842
 
828
843
  Ok(config)
@@ -952,9 +967,8 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
952
967
  && val.equal(ruby.qnil()).ok() != Some(true)
953
968
  {
954
969
  let security_json = ruby_value_to_json(val)?;
955
- let parsed: kreuzberg::extractors::security::SecurityLimits =
956
- serde_json::from_value(security_json)
957
- .map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
970
+ let parsed: kreuzberg::extractors::security::SecurityLimits = serde_json::from_value(security_json)
971
+ .map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
958
972
  config.security_limits = Some(parsed);
959
973
  }
960
974
 
data/lib/kreuzberg/config.rb CHANGED
@@ -850,19 +850,21 @@ module Kreuzberg
850
850
  # )
851
851
  #
852
852
  class LayoutDetection
853
- attr_reader :preset, :confidence_threshold, :apply_heuristics
853
+ attr_reader :preset, :confidence_threshold, :apply_heuristics, :table_model
854
854
 
855
- def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
855
+ def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true, table_model: nil)
856
856
  @preset = preset.to_s
857
857
  @confidence_threshold = confidence_threshold&.to_f
858
858
  @apply_heuristics = apply_heuristics ? true : false
859
+ @table_model = table_model&.to_s
859
860
  end
860
861
 
861
862
  def to_h
862
863
  {
863
864
  preset: @preset,
864
865
  confidence_threshold: @confidence_threshold,
865
- apply_heuristics: @apply_heuristics
866
+ apply_heuristics: @apply_heuristics,
867
+ table_model: @table_model
866
868
  }.compact
867
869
  end
868
870
  end
@@ -930,7 +932,8 @@ module Kreuzberg
930
932
  :images, :postprocessor,
931
933
  :token_reduction, :keywords, :html_options, :pages,
932
934
  :max_concurrent_extractions, :output_format, :result_format,
933
- :security_limits, :layout, :concurrency
935
+ :security_limits, :layout, :concurrency,
936
+ :cache_namespace, :cache_ttl_secs
934
937
 
935
938
  # Alias for backward compatibility - image_extraction is the canonical name
936
939
  alias image_extraction images
@@ -955,7 +958,7 @@ module Kreuzberg
955
958
  language_detection pdf_options image_extraction
956
959
  postprocessor token_reduction keywords html_options pages
957
960
  max_concurrent_extractions output_format result_format
958
- security_limits layout concurrency
961
+ security_limits layout concurrency cache_namespace cache_ttl_secs
959
962
  ].freeze
960
963
 
961
964
  # Aliases for backward compatibility
@@ -1032,7 +1035,9 @@ module Kreuzberg
1032
1035
  result_format: nil,
1033
1036
  security_limits: nil,
1034
1037
  layout: nil,
1035
- concurrency: nil)
1038
+ concurrency: nil,
1039
+ cache_namespace: nil,
1040
+ cache_ttl_secs: nil)
1036
1041
  kwargs = {
1037
1042
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1038
1043
  force_ocr: force_ocr, include_document_structure: include_document_structure,
@@ -1043,7 +1048,9 @@ module Kreuzberg
1043
1048
  pages: pages, max_concurrent_extractions: max_concurrent_extractions,
1044
1049
  output_format: output_format, result_format: result_format,
1045
1050
  security_limits: security_limits, layout: layout,
1046
- concurrency: concurrency
1051
+ concurrency: concurrency,
1052
+ cache_namespace: cache_namespace,
1053
+ cache_ttl_secs: cache_ttl_secs
1047
1054
  }
1048
1055
  extracted = extract_from_hash(hash, kwargs)
1049
1056
 
@@ -1077,6 +1084,8 @@ module Kreuzberg
1077
1084
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1078
1085
  @output_format = validate_output_format(params[:output_format])
1079
1086
  @result_format = validate_result_format(params[:result_format])
1087
+ @cache_namespace = params[:cache_namespace]
1088
+ @cache_ttl_secs = params[:cache_ttl_secs]&.to_i
1080
1089
  @security_limits = params[:security_limits]
1081
1090
  end
1082
1091
 
@@ -1112,7 +1121,9 @@ module Kreuzberg
1112
1121
  include_document_structure: @include_document_structure,
1113
1122
  max_concurrent_extractions: @max_concurrent_extractions,
1114
1123
  output_format: @output_format,
1115
- result_format: @result_format
1124
+ result_format: @result_format,
1125
+ cache_namespace: @cache_namespace,
1126
+ cache_ttl_secs: @cache_ttl_secs
1116
1127
  }
1117
1128
  end
1118
1129
 
@@ -1271,6 +1282,10 @@ module Kreuzberg
1271
1282
  @output_format = validate_output_format(value)
1272
1283
  when :result_format
1273
1284
  @result_format = validate_result_format(value)
1285
+ when :cache_namespace
1286
+ @cache_namespace = value
1287
+ when :cache_ttl_secs
1288
+ @cache_ttl_secs = value&.to_i
1274
1289
  else
1275
1290
  raise ArgumentError, "Unknown configuration key: #{key}"
1276
1291
  end
@@ -1352,6 +1367,8 @@ module Kreuzberg
1352
1367
  @max_concurrent_extractions = merged.max_concurrent_extractions
1353
1368
  @output_format = merged.output_format
1354
1369
  @result_format = merged.result_format
1370
+ @cache_namespace = merged.cache_namespace
1371
+ @cache_ttl_secs = merged.cache_ttl_secs
1355
1372
  end
1356
1373
  end
1357
1374
  end
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.5.1'
4
+ VERSION = '4.5.4'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -463,8 +463,9 @@ module Kreuzberg
463
463
  attr_reader preset: String
464
464
  attr_reader confidence_threshold: Float?
465
465
  attr_reader apply_heuristics: bool
466
+ attr_reader table_model: String?
466
467
 
467
- def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
468
+ def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
468
469
  def to_h: () -> Hash[Symbol, untyped]
469
470
  end
470
471
 
@@ -478,6 +479,8 @@ module Kreuzberg
478
479
  class Extraction
479
480
  attr_reader use_cache: bool
480
481
  attr_reader enable_quality_processing: bool
482
+ attr_reader cache_namespace: String?
483
+ attr_reader cache_ttl_secs: Integer?
481
484
  attr_reader force_ocr: bool
482
485
  attr_reader include_document_structure: bool
483
486
  attr_reader ocr: OCR?
@@ -520,7 +523,9 @@ module Kreuzberg
520
523
  ?concurrency: (Concurrency | Hash[Symbol, untyped])?,
521
524
  ?max_concurrent_extractions: Integer?,
522
525
  ?output_format: String?,
523
- ?result_format: String?
526
+ ?result_format: String?,
527
+ ?cache_namespace: String?,
528
+ ?cache_ttl_secs: Integer?
524
529
  ) -> void
525
530
  def to_h: () -> Hash[Symbol, untyped]
526
531
  def to_json: (*untyped) -> String
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.5.1"
5
+ version = "4.5.4"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -15,7 +15,9 @@ ahash = "0.8.12"
15
15
  anyhow = "1.0"
16
16
  async-trait = "0.1.89"
17
17
  base64 = "0.22.1"
18
+ blake3 = "1"
18
19
  bytes = { version = "1", features = ["serde"] }
20
+ cfb = "0.14"
19
21
  chrono = "0.4"
20
22
  clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
21
23
  console_error_panic_hook = "0.1"
@@ -24,13 +26,12 @@ ctor = "0.6"
24
26
  dbase = "0.7"
25
27
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
26
28
  hex = "0.4.3"
27
- html-to-markdown-rs = { version = "2.28.6", default-features = false }
28
- hwpers = "0.5"
29
+ html-to-markdown-rs = { version = "2.29.0", default-features = false }
29
30
  image = { version = "0.25.10", default-features = false }
30
31
  itertools = "0.14"
31
32
  js-sys = "0.3"
32
- kreuzberg = { path = "./crates/kreuzberg", version = "4.5.1", default-features = false }
33
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.1" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.5.4", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.4" }
34
35
  lazy_static = "1.5.0"
35
36
  libc = "0.2.183"
36
37
  log = "0.4"