kreuzberg 4.5.2 → 4.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +4 -4
  4. data/ext/kreuzberg_rb/native/Cargo.lock +62 -46
  5. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -9
  7. data/lib/kreuzberg/config.rb +5 -3
  8. data/lib/kreuzberg/version.rb +1 -1
  9. data/sig/kreuzberg.rbs +2 -1
  10. data/vendor/Cargo.toml +5 -5
  11. data/vendor/kreuzberg/Cargo.toml +144 -112
  12. data/vendor/kreuzberg/README.md +1 -1
  13. data/vendor/kreuzberg/src/chunking/core.rs +47 -0
  14. data/vendor/kreuzberg/src/core/config/layout.rs +12 -0
  15. data/vendor/kreuzberg/src/core/config/processing.rs +19 -0
  16. data/vendor/kreuzberg/src/core/mime.rs +42 -1
  17. data/vendor/kreuzberg/src/extraction/hwp/error.rs +54 -0
  18. data/vendor/kreuzberg/src/extraction/hwp/mod.rs +72 -0
  19. data/vendor/kreuzberg/src/extraction/hwp/model.rs +102 -0
  20. data/vendor/kreuzberg/src/extraction/hwp/parser.rs +174 -0
  21. data/vendor/kreuzberg/src/extraction/hwp/reader.rs +126 -0
  22. data/vendor/kreuzberg/src/extraction/mod.rs +3 -0
  23. data/vendor/kreuzberg/src/extractors/epub/content.rs +58 -7
  24. data/vendor/kreuzberg/src/extractors/epub/mod.rs +19 -5
  25. data/vendor/kreuzberg/src/extractors/hwp.rs +4 -5
  26. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +189 -0
  27. data/vendor/kreuzberg/src/extractors/iwork/mod.rs +291 -0
  28. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +186 -0
  29. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +182 -0
  30. data/vendor/kreuzberg/src/extractors/mod.rs +13 -0
  31. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +4 -0
  32. data/vendor/kreuzberg/src/layout/mod.rs +133 -0
  33. data/vendor/kreuzberg/src/layout/model_manager.rs +61 -2
  34. data/vendor/kreuzberg/src/layout/models/mod.rs +2 -0
  35. data/vendor/kreuzberg/src/layout/models/slanet.rs +550 -0
  36. data/vendor/kreuzberg/src/layout/models/table_classifier.rs +219 -0
  37. data/vendor/kreuzberg/src/pdf/images.rs +13 -0
  38. data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +244 -65
  39. data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +2 -0
  40. data/vendor/kreuzberg/src/pdf/markdown/regions/table_recognition.rs +334 -1
  41. data/vendor/kreuzberg/tests/epub_markdown_headings_tests.rs +177 -0
  42. data/vendor/kreuzberg/tests/iwork_integration.rs +220 -0
  43. data/vendor/kreuzberg-ffi/Cargo.toml +14 -14
  44. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  45. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +14 -14
  46. data/vendor/kreuzberg-pdfium-render/Cargo.toml +17 -17
  47. data/vendor/kreuzberg-tesseract/Cargo.toml +27 -27
  48. metadata +15 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2f29e7c9b7614fc78e0c54f673a804f79625081faa79317da54647937fe51a46
4
- data.tar.gz: b465d7be3c677c7a7a87eb888503f57b7cf42e5bac353418a191cc2629ad3d5c
3
+ metadata.gz: 47a14cc623891453596552fd893b43f18ffa04068de61b47b34b7f18ad8af890
4
+ data.tar.gz: 5e18a52f5acbabba2ee64790b3831c89301e58043c8b22ba7616791a2338401e
5
5
  SHA512:
6
- metadata.gz: fc25d857d8252f4759ed2ea07003107843182c87d855872da228f599371cdb9f705d2883995bc17ac6dd2fadf12d6aa2023eb1abf6f69f5e2844b1a90473cb02
7
- data.tar.gz: 5fe146eebe572f4a6b5ac89d9e187b97eb72787493b4748ba66c968014cbc7757b0ee6bec64516968ff7419a0c6c4c5b57e0b88240a61cde454ac72fa0fed9e7
6
+ metadata.gz: 9ac2251c79bcdff41d8746ea3244bf3774cd022aa0f9c63a0153a7ec394f86d1bfd4fe3b364d175371f602006aafea17f612250b2b8368e213989b910f17940e
7
+ data.tar.gz: 8494619ec2253eaeb68b95fc4e204355ab2a08f71f5863f88b8305c7c07333a456f73c497804de7ca7693b1207bd0afaef3e6ca0ab5cb811eee1686823bff31d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.5.2)
4
+ kreuzberg (4.5.4)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
224
224
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
225
- kreuzberg (4.5.2)
225
+ kreuzberg (4.5.4)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.4" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -61,7 +61,7 @@
61
61
  </div>
62
62
 
63
63
 
64
- Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
64
+ Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
65
65
 
66
66
 
67
67
  ## Installation
@@ -211,9 +211,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
211
211
 
212
212
  ## Features
213
213
 
214
- ### Supported File Formats (88+)
214
+ ### Supported File Formats (91+)
215
215
 
216
- 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
216
+ 91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
217
217
 
218
218
  #### Office Documents
219
219
 
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -1215,15 +1215,15 @@ checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
1215
1215
 
1216
1216
  [[package]]
1217
1217
  name = "deflate64"
1218
- version = "0.1.11"
1218
+ version = "0.1.12"
1219
1219
  source = "registry+https://github.com/rust-lang/crates.io-index"
1220
- checksum = "807800ff3288b621186fe0a8f3392c4652068257302709c24efd918c3dffcdc2"
1220
+ checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
1221
1221
 
1222
1222
  [[package]]
1223
1223
  name = "der"
1224
- version = "0.7.10"
1224
+ version = "0.8.0"
1225
1225
  source = "registry+https://github.com/rust-lang/crates.io-index"
1226
- checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
1226
+ checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
1227
1227
  dependencies = [
1228
1228
  "pem-rfc7468",
1229
1229
  "zeroize",
@@ -2027,7 +2027,7 @@ dependencies = [
2027
2027
  "serde",
2028
2028
  "serde_json",
2029
2029
  "thiserror 2.0.18",
2030
- "ureq 3.2.1",
2030
+ "ureq 3.3.0",
2031
2031
  "windows-sys 0.61.2",
2032
2032
  ]
2033
2033
 
@@ -2057,9 +2057,9 @@ dependencies = [
2057
2057
 
2058
2058
  [[package]]
2059
2059
  name = "html-to-markdown-rs"
2060
- version = "2.28.6"
2060
+ version = "2.29.0"
2061
2061
  source = "registry+https://github.com/rust-lang/crates.io-index"
2062
- checksum = "6869b5e058b5ebb8c176269406b692d0695b4b19c36e532b56a2c355590978ae"
2062
+ checksum = "9013679b8c3600142e5a8f742748c3c38c49d9fc50675dad62f8f1721090a85a"
2063
2063
  dependencies = [
2064
2064
  "ahash",
2065
2065
  "astral-tl",
@@ -2669,9 +2669,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
2669
2669
 
2670
2670
  [[package]]
2671
2671
  name = "iri-string"
2672
- version = "0.7.10"
2672
+ version = "0.7.11"
2673
2673
  source = "registry+https://github.com/rust-lang/crates.io-index"
2674
- checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
2674
+ checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
2675
2675
  dependencies = [
2676
2676
  "memchr",
2677
2677
  "serde",
@@ -2757,7 +2757,7 @@ dependencies = [
2757
2757
  "cesu8",
2758
2758
  "cfg-if",
2759
2759
  "combine",
2760
- "jni-sys",
2760
+ "jni-sys 0.3.1",
2761
2761
  "log",
2762
2762
  "thiserror 1.0.69",
2763
2763
  "walkdir",
@@ -2766,9 +2766,31 @@ dependencies = [
2766
2766
 
2767
2767
  [[package]]
2768
2768
  name = "jni-sys"
2769
- version = "0.3.0"
2769
+ version = "0.3.1"
2770
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2771
+ checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
2772
+ dependencies = [
2773
+ "jni-sys 0.4.1",
2774
+ ]
2775
+
2776
+ [[package]]
2777
+ name = "jni-sys"
2778
+ version = "0.4.1"
2779
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2780
+ checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
2781
+ dependencies = [
2782
+ "jni-sys-macros",
2783
+ ]
2784
+
2785
+ [[package]]
2786
+ name = "jni-sys-macros"
2787
+ version = "0.4.1"
2770
2788
  source = "registry+https://github.com/rust-lang/crates.io-index"
2771
- checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
2789
+ checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
2790
+ dependencies = [
2791
+ "quote",
2792
+ "syn",
2793
+ ]
2772
2794
 
2773
2795
  [[package]]
2774
2796
  name = "jobserver"
@@ -2807,7 +2829,7 @@ dependencies = [
2807
2829
 
2808
2830
  [[package]]
2809
2831
  name = "kreuzberg"
2810
- version = "4.5.1"
2832
+ version = "4.5.3"
2811
2833
  dependencies = [
2812
2834
  "ahash",
2813
2835
  "async-trait",
@@ -2870,6 +2892,7 @@ dependencies = [
2870
2892
  "serde_yaml_ng",
2871
2893
  "sevenz-rust2",
2872
2894
  "sha2",
2895
+ "snap",
2873
2896
  "tar",
2874
2897
  "text-splitter",
2875
2898
  "thiserror 2.0.18",
@@ -2882,16 +2905,16 @@ dependencies = [
2882
2905
  "tracing",
2883
2906
  "tracing-opentelemetry",
2884
2907
  "unicode-normalization",
2885
- "ureq 3.2.1",
2908
+ "ureq 3.3.0",
2886
2909
  "utoipa",
2887
2910
  "whatlang",
2888
2911
  "yake-rust",
2889
- "zip 8.3.0",
2912
+ "zip 7.2.0",
2890
2913
  ]
2891
2914
 
2892
2915
  [[package]]
2893
2916
  name = "kreuzberg-ffi"
2894
- version = "4.5.1"
2917
+ version = "4.5.3"
2895
2918
  dependencies = [
2896
2919
  "ahash",
2897
2920
  "async-trait",
@@ -2907,7 +2930,7 @@ dependencies = [
2907
2930
 
2908
2931
  [[package]]
2909
2932
  name = "kreuzberg-paddle-ocr"
2910
- version = "4.5.1"
2933
+ version = "4.5.3"
2911
2934
  dependencies = [
2912
2935
  "geo-clipper",
2913
2936
  "geo-types",
@@ -2921,7 +2944,7 @@ dependencies = [
2921
2944
 
2922
2945
  [[package]]
2923
2946
  name = "kreuzberg-pdfium-render"
2924
- version = "4.5.1"
2947
+ version = "4.5.3"
2925
2948
  dependencies = [
2926
2949
  "bitflags",
2927
2950
  "bytemuck",
@@ -2944,7 +2967,7 @@ dependencies = [
2944
2967
 
2945
2968
  [[package]]
2946
2969
  name = "kreuzberg-rb"
2947
- version = "4.5.1"
2970
+ version = "4.5.3"
2948
2971
  dependencies = [
2949
2972
  "async-trait",
2950
2973
  "html-to-markdown-rs",
@@ -2961,13 +2984,13 @@ dependencies = [
2961
2984
 
2962
2985
  [[package]]
2963
2986
  name = "kreuzberg-tesseract"
2964
- version = "4.5.1"
2987
+ version = "4.5.3"
2965
2988
  dependencies = [
2966
2989
  "cc",
2967
2990
  "cmake",
2968
2991
  "reqwest",
2969
2992
  "thiserror 2.0.18",
2970
- "zip 8.3.0",
2993
+ "zip 7.2.0",
2971
2994
  ]
2972
2995
 
2973
2996
  [[package]]
@@ -3739,7 +3762,7 @@ dependencies = [
3739
3762
  "ort-sys",
3740
3763
  "smallvec",
3741
3764
  "tracing",
3742
- "ureq 3.2.1",
3765
+ "ureq 3.3.0",
3743
3766
  ]
3744
3767
 
3745
3768
  [[package]]
@@ -3750,7 +3773,7 @@ checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
3750
3773
  dependencies = [
3751
3774
  "hmac-sha256",
3752
3775
  "lzma-rust2 0.15.7",
3753
- "ureq 3.2.1",
3776
+ "ureq 3.3.0",
3754
3777
  ]
3755
3778
 
3756
3779
  [[package]]
@@ -3806,9 +3829,9 @@ dependencies = [
3806
3829
 
3807
3830
  [[package]]
3808
3831
  name = "pem-rfc7468"
3809
- version = "0.7.0"
3832
+ version = "1.0.0"
3810
3833
  source = "registry+https://github.com/rust-lang/crates.io-index"
3811
- checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
3834
+ checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9"
3812
3835
  dependencies = [
3813
3836
  "base64ct",
3814
3837
  ]
@@ -4004,9 +4027,9 @@ dependencies = [
4004
4027
 
4005
4028
  [[package]]
4006
4029
  name = "pulldown-cmark"
4007
- version = "0.13.2"
4030
+ version = "0.13.3"
4008
4031
  source = "registry+https://github.com/rust-lang/crates.io-index"
4009
- checksum = "14104c5a24d9bcf7eb2c24753e0f49fe14555d8bd565ea3d38e4b4303267259d"
4032
+ checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
4010
4033
  dependencies = [
4011
4034
  "bitflags",
4012
4035
  "getopts",
@@ -4979,6 +5002,12 @@ version = "1.15.1"
4979
5002
  source = "registry+https://github.com/rust-lang/crates.io-index"
4980
5003
  checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
4981
5004
 
5005
+ [[package]]
5006
+ name = "snap"
5007
+ version = "1.1.1"
5008
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5009
+ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
5010
+
4982
5011
  [[package]]
4983
5012
  name = "socket2"
4984
5013
  version = "0.6.3"
@@ -5772,9 +5801,9 @@ dependencies = [
5772
5801
 
5773
5802
  [[package]]
5774
5803
  name = "ureq"
5775
- version = "3.2.1"
5804
+ version = "3.3.0"
5776
5805
  source = "registry+https://github.com/rust-lang/crates.io-index"
5777
- checksum = "4ab5172ab0c2b6d01a9bb4f9332f7c1211193ea002742188040d09ea4eafe867"
5806
+ checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
5778
5807
  dependencies = [
5779
5808
  "base64 0.22.1",
5780
5809
  "cookie_store",
@@ -5796,9 +5825,9 @@ dependencies = [
5796
5825
 
5797
5826
  [[package]]
5798
5827
  name = "ureq-proto"
5799
- version = "0.5.3"
5828
+ version = "0.6.0"
5800
5829
  source = "registry+https://github.com/rust-lang/crates.io-index"
5801
- checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
5830
+ checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
5802
5831
  dependencies = [
5803
5832
  "base64 0.22.1",
5804
5833
  "http",
@@ -6846,19 +6875,6 @@ dependencies = [
6846
6875
  "zopfli",
6847
6876
  ]
6848
6877
 
6849
- [[package]]
6850
- name = "zip"
6851
- version = "8.3.0"
6852
- source = "registry+https://github.com/rust-lang/crates.io-index"
6853
- checksum = "4a243cfad17427fc077f529da5a95abe4e94fd2bfdb601611870a6557cc67657"
6854
- dependencies = [
6855
- "crc32fast",
6856
- "flate2",
6857
- "indexmap",
6858
- "memchr",
6859
- "typed-path",
6860
- ]
6861
-
6862
6878
  [[package]]
6863
6879
  name = "zlib-rs"
6864
6880
  version = "0.6.3"
@@ -6928,9 +6944,9 @@ dependencies = [
6928
6944
 
6929
6945
  [[package]]
6930
6946
  name = "zune-jpeg"
6931
- version = "0.5.13"
6947
+ version = "0.5.14"
6932
6948
  source = "registry+https://github.com/rust-lang/crates.io-index"
6933
- checksum = "ec5f41c76397b7da451efd19915684f727d7e1d516384ca6bd0ec43ec94de23c"
6949
+ checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6"
6934
6950
  dependencies = [
6935
6951
  "zune-core",
6936
6952
  ]
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.5.2"
3
+ version = "4.5.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -64,7 +64,7 @@ tokio = { version = "1.50.0", features = [
64
64
  "time",
65
65
  "io-util",
66
66
  ] }
67
- html-to-markdown-rs = { version = "2.28.6", default-features = false }
67
+ html-to-markdown-rs = { version = "2.29.0", default-features = false }
68
68
 
69
69
  [dev-dependencies]
70
70
  pretty_assertions = "1.4"
data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED
@@ -139,6 +139,12 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
139
139
 
140
140
  let sizing = parse_chunk_sizing(ruby, hash)?;
141
141
 
142
+ let prepend_heading_context = if let Some(val) = get_kw(ruby, hash, "prepend_heading_context") {
143
+ bool::try_convert(val)?
144
+ } else {
145
+ false
146
+ };
147
+
142
148
  let config = ChunkingConfig {
143
149
  max_characters: max_chars,
144
150
  overlap: max_overlap,
@@ -147,6 +153,7 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
147
153
  embedding,
148
154
  preset,
149
155
  sizing,
156
+ prepend_heading_context,
150
157
  };
151
158
 
152
159
  Ok(config)
@@ -315,12 +322,11 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
315
322
  None
316
323
  };
317
324
 
318
- let allow_single_column_tables =
319
- if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
320
- bool::try_convert(val)?
321
- } else {
322
- false
323
- };
325
+ let allow_single_column_tables = if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
326
+ bool::try_convert(val)?
327
+ } else {
328
+ false
329
+ };
324
330
 
325
331
  let config = PdfConfig {
326
332
  extract_images,
@@ -819,10 +825,19 @@ pub fn parse_layout_detection_config(ruby: &Ruby, hash: RHash) -> Result<LayoutD
819
825
  true
820
826
  };
821
827
 
828
+ let table_model = if let Some(val) = get_kw(ruby, hash, "table_model")
829
+ && val.equal(ruby.qnil()).ok() != Some(true)
830
+ {
831
+ Some(String::try_convert(val)?)
832
+ } else {
833
+ None
834
+ };
835
+
822
836
  let config = LayoutDetectionConfig {
823
837
  preset,
824
838
  confidence_threshold,
825
839
  apply_heuristics,
840
+ table_model,
826
841
  };
827
842
 
828
843
  Ok(config)
@@ -952,9 +967,8 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
952
967
  && val.equal(ruby.qnil()).ok() != Some(true)
953
968
  {
954
969
  let security_json = ruby_value_to_json(val)?;
955
- let parsed: kreuzberg::extractors::security::SecurityLimits =
956
- serde_json::from_value(security_json)
957
- .map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
970
+ let parsed: kreuzberg::extractors::security::SecurityLimits = serde_json::from_value(security_json)
971
+ .map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
958
972
  config.security_limits = Some(parsed);
959
973
  }
960
974
 
data/lib/kreuzberg/config.rb CHANGED
@@ -850,19 +850,21 @@ module Kreuzberg
850
850
  # )
851
851
  #
852
852
  class LayoutDetection
853
- attr_reader :preset, :confidence_threshold, :apply_heuristics
853
+ attr_reader :preset, :confidence_threshold, :apply_heuristics, :table_model
854
854
 
855
- def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
855
+ def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true, table_model: nil)
856
856
  @preset = preset.to_s
857
857
  @confidence_threshold = confidence_threshold&.to_f
858
858
  @apply_heuristics = apply_heuristics ? true : false
859
+ @table_model = table_model&.to_s
859
860
  end
860
861
 
861
862
  def to_h
862
863
  {
863
864
  preset: @preset,
864
865
  confidence_threshold: @confidence_threshold,
865
- apply_heuristics: @apply_heuristics
866
+ apply_heuristics: @apply_heuristics,
867
+ table_model: @table_model
866
868
  }.compact
867
869
  end
868
870
  end
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.5.2'
4
+ VERSION = '4.5.4'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -463,8 +463,9 @@ module Kreuzberg
463
463
  attr_reader preset: String
464
464
  attr_reader confidence_threshold: Float?
465
465
  attr_reader apply_heuristics: bool
466
+ attr_reader table_model: String?
466
467
 
467
- def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
468
+ def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
468
469
  def to_h: () -> Hash[Symbol, untyped]
469
470
  end
470
471
 
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.5.2"
5
+ version = "4.5.4"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -17,6 +17,7 @@ async-trait = "0.1.89"
17
17
  base64 = "0.22.1"
18
18
  blake3 = "1"
19
19
  bytes = { version = "1", features = ["serde"] }
20
+ cfb = "0.14"
20
21
  chrono = "0.4"
21
22
  clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
22
23
  console_error_panic_hook = "0.1"
@@ -25,13 +26,12 @@ ctor = "0.6"
25
26
  dbase = "0.7"
26
27
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
27
28
  hex = "0.4.3"
28
- html-to-markdown-rs = { version = "2.28.6", default-features = false }
29
- hwpers = "0.5"
29
+ html-to-markdown-rs = { version = "2.29.0", default-features = false }
30
30
  image = { version = "0.25.10", default-features = false }
31
31
  itertools = "0.14"
32
32
  js-sys = "0.3"
33
- kreuzberg = { path = "./crates/kreuzberg", version = "4.5.2", default-features = false }
34
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.2" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.5.4", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.4" }
35
35
  lazy_static = "1.5.0"
36
36
  libc = "0.2.183"
37
37
  log = "0.4"