kreuzberg 4.3.6 → 4.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +65 -28
  5. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +29 -0
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
  8. data/ext/kreuzberg_rb/native/src/result.rs +33 -0
  9. data/lib/kreuzberg/config.rb +13 -3
  10. data/lib/kreuzberg/result.rb +32 -2
  11. data/lib/kreuzberg/types.rb +20 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +28 -2
  14. data/vendor/Cargo.toml +2 -2
  15. data/vendor/kreuzberg/Cargo.toml +8 -8
  16. data/vendor/kreuzberg/README.md +1 -1
  17. data/vendor/kreuzberg/src/chunking/processor.rs +5 -0
  18. data/vendor/kreuzberg/src/core/config/pdf.rs +32 -0
  19. data/vendor/kreuzberg/src/core/config_validation/mod.rs +1 -1
  20. data/vendor/kreuzberg/src/core/extractor/batch.rs +2 -0
  21. data/vendor/kreuzberg/src/core/extractor/sync.rs +1 -0
  22. data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -0
  23. data/vendor/kreuzberg/src/core/pipeline/tests.rs +94 -0
  24. data/vendor/kreuzberg/src/extraction/image_ocr.rs +1 -0
  25. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
  26. data/vendor/kreuzberg/src/extraction/transform/mod.rs +4 -0
  27. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  28. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/citation.rs +2 -0
  30. data/vendor/kreuzberg/src/extractors/csv.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +1 -0
  33. data/vendor/kreuzberg/src/extractors/doc.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/docx.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/email.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/epub/mod.rs +1 -0
  38. data/vendor/kreuzberg/src/extractors/excel.rs +21 -8
  39. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/html.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/image.rs +2 -0
  42. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  44. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/odt.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -0
  48. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +297 -4
  50. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +7 -0
  51. data/vendor/kreuzberg/src/extractors/ppt.rs +1 -0
  52. data/vendor/kreuzberg/src/extractors/pptx.rs +2 -0
  53. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -0
  55. data/vendor/kreuzberg/src/extractors/structured.rs +1 -0
  56. data/vendor/kreuzberg/src/extractors/text.rs +2 -0
  57. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  58. data/vendor/kreuzberg/src/extractors/xml.rs +1 -0
  59. data/vendor/kreuzberg/src/keywords/processor.rs +7 -0
  60. data/vendor/kreuzberg/src/language_detection/processor.rs +5 -0
  61. data/vendor/kreuzberg/src/mcp/format.rs +4 -0
  62. data/vendor/kreuzberg/src/mcp/params.rs +20 -0
  63. data/vendor/kreuzberg/src/mcp/server.rs +8 -2
  64. data/vendor/kreuzberg/src/mcp/tools/cache.rs +8 -8
  65. data/vendor/kreuzberg/src/ocr/table/mod.rs +26 -5
  66. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  67. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +1 -0
  68. data/vendor/kreuzberg/src/pdf/annotations.rs +177 -0
  69. data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +5 -14
  70. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +420 -51
  71. data/vendor/kreuzberg/src/pdf/markdown/classify.rs +1 -9
  72. data/vendor/kreuzberg/src/pdf/markdown/columns.rs +182 -0
  73. data/vendor/kreuzberg/src/pdf/markdown/constants.rs +2 -2
  74. data/vendor/kreuzberg/src/pdf/markdown/lines.rs +10 -25
  75. data/vendor/kreuzberg/src/pdf/markdown/mod.rs +2 -1
  76. data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +2 -21
  77. data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +56 -20
  78. data/vendor/kreuzberg/src/pdf/markdown/render.rs +0 -27
  79. data/vendor/kreuzberg/src/pdf/markdown/types.rs +0 -7
  80. data/vendor/kreuzberg/src/pdf/mod.rs +4 -0
  81. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +2 -0
  82. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +3 -0
  83. data/vendor/kreuzberg/src/plugins/ocr.rs +1 -0
  84. data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -0
  85. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +2 -0
  86. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +2 -0
  87. data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -0
  88. data/vendor/kreuzberg/src/text/quality_processor.rs +5 -0
  89. data/vendor/kreuzberg/src/types/annotations.rs +41 -0
  90. data/vendor/kreuzberg/src/types/extraction.rs +9 -0
  91. data/vendor/kreuzberg/src/types/mod.rs +2 -0
  92. data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +33 -0
  93. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +12 -0
  94. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +3 -0
  95. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +40 -1
  96. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  97. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  98. data/vendor/kreuzberg-ffi/src/helpers.rs +17 -0
  99. data/vendor/kreuzberg-ffi/src/lib.rs +7 -4
  100. data/vendor/kreuzberg-ffi/src/memory.rs +9 -1
  101. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -0
  102. data/vendor/kreuzberg-ffi/src/result.rs +1 -0
  103. data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
  104. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  105. data/vendor/kreuzberg-ffi/src/validation.rs +1 -1
  106. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  107. data/vendor/kreuzberg-paddle-ocr/tests/diagnostic.rs +1 -1
  108. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  109. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/paragraph.rs +78 -142
  110. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  111. metadata +5 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1b3eb519a94cf2a82e9d9b649ce98e122fec14d01568ced3edc925f1cb49f4ad
4
- data.tar.gz: cdfa987af6f7bc8b0a6defb76b3426a616ed9d9451bd3aa27680eae7eba5325c
3
+ metadata.gz: 1c9f9cd70dd541fd9c193c1ab60782eb04e287019f707eeb0d6eb853f64fc039
4
+ data.tar.gz: ce8345437d8e47062a21799b0352ceb2d5917a0546421b39a928fe1b659dc28d
5
5
  SHA512:
6
- metadata.gz: 1c6170355aa3f4443b68aed401e2d7b8f20a4b792c18e130b5f59be9ea3102527bb9098f77f30d1383287d404846ca99a397807827284e7b0c774df58d85cd51
7
- data.tar.gz: 5362d11257dd57715e8f9e0743a3f5e8fba64c9177f4858b587fd7230dbae2c54fac194d0aa64da7db1f970ad8eaf605683d6547cdd39c81b55d0bebe43502d5
6
+ metadata.gz: 012fac8575af2561d8b114649ef370ee281e90629a3ac95abad398dc3371853109e76e19b4aca9aeeba315853cae72f2ff9788681bd7b1bcfa3cb8245dbaa939
7
+ data.tar.gz: fce6aa4f32a4651821d20ba61c77d6b61fc1038492e4b7d960c585ce2c131dd228430835fcb35f945998c64ae3faa71655ab5748b6243e2c70453e9505a442a0
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.3.6)
4
+ kreuzberg (4.3.7)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -124,7 +124,7 @@ GEM
124
124
  rubocop (~> 1.81)
125
125
  ruby-progressbar (1.13.0)
126
126
  securerandom (0.4.1)
127
- sorbet-runtime (0.6.12942)
127
+ sorbet-runtime (0.6.12956)
128
128
  steep (1.10.0)
129
129
  activesupport (>= 5.1)
130
130
  concurrent-ruby (>= 1.1.10)
@@ -210,7 +210,7 @@ CHECKSUMS
210
210
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
211
211
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
212
212
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
213
- kreuzberg (4.3.6)
213
+ kreuzberg (4.3.7)
214
214
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
215
215
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
216
216
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -245,7 +245,7 @@ CHECKSUMS
245
245
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
246
246
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
247
247
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
248
- sorbet-runtime (0.6.12942) sha256=967bda04814d234e4239c4f883c1d0ee6de3e47bf8bafd2c0cc30d18df2ddd3a
248
+ sorbet-runtime (0.6.12956) sha256=fee716a62d0b1d94ebc8e6ba23e76a7654eeac66c1f5cc1e1bef78b8e9ff87c7
249
249
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
250
250
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
251
251
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.6" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -16,7 +16,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
16
16
  dependencies = [
17
17
  "cfg-if",
18
18
  "cipher",
19
- "cpufeatures",
19
+ "cpufeatures 0.2.17",
20
20
  ]
21
21
 
22
22
  [[package]]
@@ -795,6 +795,17 @@ version = "0.2.1"
795
795
  source = "registry+https://github.com/rust-lang/crates.io-index"
796
796
  checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
797
797
 
798
+ [[package]]
799
+ name = "chacha20"
800
+ version = "0.10.0"
801
+ source = "registry+https://github.com/rust-lang/crates.io-index"
802
+ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
803
+ dependencies = [
804
+ "cfg-if",
805
+ "cpufeatures 0.3.0",
806
+ "rand_core 0.10.0",
807
+ ]
808
+
798
809
  [[package]]
799
810
  name = "chardetng"
800
811
  version = "0.1.17"
@@ -1099,6 +1110,15 @@ dependencies = [
1099
1110
  "libc",
1100
1111
  ]
1101
1112
 
1113
+ [[package]]
1114
+ name = "cpufeatures"
1115
+ version = "0.3.0"
1116
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1117
+ checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
1118
+ dependencies = [
1119
+ "libc",
1120
+ ]
1121
+
1102
1122
  [[package]]
1103
1123
  name = "crc"
1104
1124
  version = "3.3.0"
@@ -1632,9 +1652,9 @@ dependencies = [
1632
1652
 
1633
1653
  [[package]]
1634
1654
  name = "fastembed"
1635
- version = "5.9.0"
1655
+ version = "5.11.0"
1636
1656
  source = "registry+https://github.com/rust-lang/crates.io-index"
1637
- checksum = "f6f8b48452e32595e2fcb89d0c582490681ef4febca0ae037fc75a122525953a"
1657
+ checksum = "b4339d45a80579ab8305616a501eacdbf18fb0f7def7fa6e4c0b75941416d5b0"
1638
1658
  dependencies = [
1639
1659
  "anyhow",
1640
1660
  "hf-hub",
@@ -1961,6 +1981,7 @@ dependencies = [
1961
1981
  "js-sys",
1962
1982
  "libc",
1963
1983
  "r-efi",
1984
+ "rand_core 0.10.0",
1964
1985
  "wasip2",
1965
1986
  "wasip3",
1966
1987
  "wasm-bindgen",
@@ -2258,9 +2279,9 @@ dependencies = [
2258
2279
 
2259
2280
  [[package]]
2260
2281
  name = "html-to-markdown-rs"
2261
- version = "2.25.0"
2282
+ version = "2.25.1"
2262
2283
  source = "registry+https://github.com/rust-lang/crates.io-index"
2263
- checksum = "bb31d75f2fdbc8d889d78a912e10c22c30451afb44ee3310f5bfcabf79a31a17"
2284
+ checksum = "c05335c6bf406653110ad8447c84461c6d0cda5e0aff9d3d3518f87502d30abe"
2264
2285
  dependencies = [
2265
2286
  "ahash",
2266
2287
  "astral-tl",
@@ -2880,7 +2901,7 @@ dependencies = [
2880
2901
 
2881
2902
  [[package]]
2882
2903
  name = "kreuzberg"
2883
- version = "4.3.4"
2904
+ version = "4.3.6"
2884
2905
  dependencies = [
2885
2906
  "ahash",
2886
2907
  "async-trait",
@@ -2955,7 +2976,7 @@ dependencies = [
2955
2976
  "thiserror 2.0.18",
2956
2977
  "tiff 0.11.0",
2957
2978
  "tokio",
2958
- "toml 1.0.1+spec-1.1.0",
2979
+ "toml 1.0.3+spec-1.1.0",
2959
2980
  "tower",
2960
2981
  "tower-http",
2961
2982
  "tracing",
@@ -2967,12 +2988,12 @@ dependencies = [
2967
2988
  "uuid",
2968
2989
  "whatlang",
2969
2990
  "yake-rust",
2970
- "zip 8.0.0",
2991
+ "zip 8.1.0",
2971
2992
  ]
2972
2993
 
2973
2994
  [[package]]
2974
2995
  name = "kreuzberg-ffi"
2975
- version = "4.3.4"
2996
+ version = "4.3.6"
2976
2997
  dependencies = [
2977
2998
  "ahash",
2978
2999
  "async-trait",
@@ -2988,7 +3009,7 @@ dependencies = [
2988
3009
 
2989
3010
  [[package]]
2990
3011
  name = "kreuzberg-paddle-ocr"
2991
- version = "4.3.4"
3012
+ version = "4.3.6"
2992
3013
  dependencies = [
2993
3014
  "geo-clipper",
2994
3015
  "geo-types",
@@ -3003,7 +3024,7 @@ dependencies = [
3003
3024
 
3004
3025
  [[package]]
3005
3026
  name = "kreuzberg-pdfium-render"
3006
- version = "4.3.4"
3027
+ version = "4.3.6"
3007
3028
  dependencies = [
3008
3029
  "bitflags",
3009
3030
  "bytemuck",
@@ -3027,7 +3048,7 @@ dependencies = [
3027
3048
 
3028
3049
  [[package]]
3029
3050
  name = "kreuzberg-rb"
3030
- version = "4.3.4"
3051
+ version = "4.3.6"
3031
3052
  dependencies = [
3032
3053
  "async-trait",
3033
3054
  "html-to-markdown-rs",
@@ -3044,14 +3065,14 @@ dependencies = [
3044
3065
 
3045
3066
  [[package]]
3046
3067
  name = "kreuzberg-tesseract"
3047
- version = "4.3.4"
3068
+ version = "4.3.6"
3048
3069
  dependencies = [
3049
3070
  "cc",
3050
3071
  "cmake",
3051
3072
  "libc",
3052
3073
  "reqwest 0.13.2",
3053
3074
  "thiserror 2.0.18",
3054
- "zip 8.0.0",
3075
+ "zip 8.1.0",
3055
3076
  ]
3056
3077
 
3057
3078
  [[package]]
@@ -4985,6 +5006,17 @@ dependencies = [
4985
5006
  "rand_core 0.9.3",
4986
5007
  ]
4987
5008
 
5009
+ [[package]]
5010
+ name = "rand"
5011
+ version = "0.10.0"
5012
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5013
+ checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
5014
+ dependencies = [
5015
+ "chacha20",
5016
+ "getrandom 0.4.1",
5017
+ "rand_core 0.10.0",
5018
+ ]
5019
+
4988
5020
  [[package]]
4989
5021
  name = "rand_chacha"
4990
5022
  version = "0.9.0"
@@ -5010,6 +5042,12 @@ dependencies = [
5010
5042
  "getrandom 0.3.4",
5011
5043
  ]
5012
5044
 
5045
+ [[package]]
5046
+ name = "rand_core"
5047
+ version = "0.10.0"
5048
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5049
+ checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
5050
+
5013
5051
  [[package]]
5014
5052
  name = "rand_distr"
5015
5053
  version = "0.5.1"
@@ -5362,12 +5400,11 @@ dependencies = [
5362
5400
 
5363
5401
  [[package]]
5364
5402
  name = "rmcp"
5365
- version = "0.15.0"
5403
+ version = "0.16.0"
5366
5404
  source = "registry+https://github.com/rust-lang/crates.io-index"
5367
- checksum = "1bef41ebc9ebed2c1b1d90203e9d1756091e8a00bbc3107676151f39868ca0ee"
5405
+ checksum = "cc4c9c94680f75470ee8083a0667988b5d7b5beb70b9f998a8e51de7c682ce60"
5368
5406
  dependencies = [
5369
5407
  "async-trait",
5370
- "axum",
5371
5408
  "base64 0.22.1",
5372
5409
  "bytes",
5373
5410
  "chrono",
@@ -5377,7 +5414,7 @@ dependencies = [
5377
5414
  "http-body-util",
5378
5415
  "pastey 0.2.1",
5379
5416
  "pin-project-lite",
5380
- "rand 0.9.2",
5417
+ "rand 0.10.0",
5381
5418
  "rmcp-macros",
5382
5419
  "schemars",
5383
5420
  "serde",
@@ -5394,9 +5431,9 @@ dependencies = [
5394
5431
 
5395
5432
  [[package]]
5396
5433
  name = "rmcp-macros"
5397
- version = "0.15.0"
5434
+ version = "0.16.0"
5398
5435
  source = "registry+https://github.com/rust-lang/crates.io-index"
5399
- checksum = "0e88ad84b8b6237a934534a62b379a5be6388915663c0cc598ceb9b3292bbbfe"
5436
+ checksum = "90c23c8f26cae4da838fbc3eadfaecf2d549d97c04b558e7bd90526a9c28b42a"
5400
5437
  dependencies = [
5401
5438
  "darling 0.23.0",
5402
5439
  "proc-macro2",
@@ -5861,7 +5898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
5861
5898
  checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
5862
5899
  dependencies = [
5863
5900
  "cfg-if",
5864
- "cpufeatures",
5901
+ "cpufeatures 0.2.17",
5865
5902
  "digest",
5866
5903
  ]
5867
5904
 
@@ -5872,7 +5909,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
5872
5909
  checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
5873
5910
  dependencies = [
5874
5911
  "cfg-if",
5875
- "cpufeatures",
5912
+ "cpufeatures 0.2.17",
5876
5913
  "digest",
5877
5914
  ]
5878
5915
 
@@ -6584,9 +6621,9 @@ dependencies = [
6584
6621
 
6585
6622
  [[package]]
6586
6623
  name = "toml"
6587
- version = "1.0.1+spec-1.1.0"
6624
+ version = "1.0.3+spec-1.1.0"
6588
6625
  source = "registry+https://github.com/rust-lang/crates.io-index"
6589
- checksum = "bbe30f93627849fa362d4a602212d41bb237dc2bd0f8ba0b2ce785012e124220"
6626
+ checksum = "c7614eaf19ad818347db24addfa201729cf2a9b6fdfd9eb0ab870fcacc606c0c"
6590
6627
  dependencies = [
6591
6628
  "indexmap",
6592
6629
  "serde_core",
@@ -6640,9 +6677,9 @@ dependencies = [
6640
6677
 
6641
6678
  [[package]]
6642
6679
  name = "toml_parser"
6643
- version = "1.0.8+spec-1.1.0"
6680
+ version = "1.0.9+spec-1.1.0"
6644
6681
  source = "registry+https://github.com/rust-lang/crates.io-index"
6645
- checksum = "0742ff5ff03ea7e67c8ae6c93cac239e0d9784833362da3f9a9c1da8dfefcbdc"
6682
+ checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
6646
6683
  dependencies = [
6647
6684
  "winnow",
6648
6685
  ]
@@ -8072,9 +8109,9 @@ dependencies = [
8072
8109
 
8073
8110
  [[package]]
8074
8111
  name = "zip"
8075
- version = "8.0.0"
8112
+ version = "8.1.0"
8076
8113
  source = "registry+https://github.com/rust-lang/crates.io-index"
8077
- checksum = "79b32dd4ad3aca14ae109f8cce0495ac1c57f6f4f00ad459a40e582f89440d97"
8114
+ checksum = "6e499faf5c6b97a0d086f4a8733de6d47aee2252b8127962439d8d4311a73f72"
8078
8115
  dependencies = [
8079
8116
  "aes",
8080
8117
  "bzip2",
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.3.6"
40
+ version = "4.3.7"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED
@@ -240,11 +240,40 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
240
240
  None
241
241
  };
242
242
 
243
+ let extract_annotations = if let Some(val) = get_kw(ruby, hash, "extract_annotations") {
244
+ bool::try_convert(val)?
245
+ } else {
246
+ false
247
+ };
248
+
249
+ let top_margin_fraction = if let Some(val) = get_kw(ruby, hash, "top_margin_fraction") {
250
+ if !val.is_nil() {
251
+ Some(f32::try_convert(val)?)
252
+ } else {
253
+ None
254
+ }
255
+ } else {
256
+ None
257
+ };
258
+
259
+ let bottom_margin_fraction = if let Some(val) = get_kw(ruby, hash, "bottom_margin_fraction") {
260
+ if !val.is_nil() {
261
+ Some(f32::try_convert(val)?)
262
+ } else {
263
+ None
264
+ }
265
+ } else {
266
+ None
267
+ };
268
+
243
269
  let config = PdfConfig {
244
270
  extract_images,
245
271
  passwords,
246
272
  extract_metadata,
247
273
  hierarchy,
274
+ extract_annotations,
275
+ top_margin_fraction,
276
+ bottom_margin_fraction,
248
277
  };
249
278
 
250
279
  Ok(config)
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs CHANGED
@@ -104,6 +104,7 @@ impl OcrBackend for RubyOcrBackend {
104
104
  extracted_keywords: None,
105
105
  quality_score: None,
106
106
  processing_warnings: vec![],
107
+ annotations: None,
107
108
  })
108
109
  })
109
110
  }
data/ext/kreuzberg_rb/native/src/result.rs CHANGED
@@ -640,5 +640,38 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
640
640
  }
641
641
  set_hash_entry(ruby, &hash, "processing_warnings", warnings_array.into_value_with(ruby))?;
642
642
 
643
+ // Convert annotations
644
+ if let Some(annotations) = result.annotations {
645
+ let annotations_array = ruby.ary_new();
646
+ for annot in annotations {
647
+ let annot_hash = ruby.hash_new();
648
+ let type_str = serde_json::to_value(&annot.annotation_type)
649
+ .ok()
650
+ .and_then(|v| v.as_str().map(String::from))
651
+ .unwrap_or_default();
652
+ annot_hash.aset("annotation_type", type_str.as_str())?;
653
+ if let Some(content) = annot.content {
654
+ annot_hash.aset("content", content.as_str())?;
655
+ } else {
656
+ annot_hash.aset("content", ruby.qnil().as_value())?;
657
+ }
658
+ annot_hash.aset("page_number", annot.page_number as i64)?;
659
+ if let Some(bbox) = annot.bounding_box {
660
+ let bbox_hash = ruby.hash_new();
661
+ bbox_hash.aset("x0", bbox.x0)?;
662
+ bbox_hash.aset("y0", bbox.y0)?;
663
+ bbox_hash.aset("x1", bbox.x1)?;
664
+ bbox_hash.aset("y1", bbox.y1)?;
665
+ annot_hash.aset("bounding_box", bbox_hash)?;
666
+ } else {
667
+ annot_hash.aset("bounding_box", ruby.qnil().as_value())?;
668
+ }
669
+ annotations_array.push(annot_hash)?;
670
+ }
671
+ set_hash_entry(ruby, &hash, "annotations", annotations_array.into_value_with(ruby))?;
672
+ } else {
673
+ set_hash_entry(ruby, &hash, "annotations", ruby.qnil().as_value())?;
674
+ }
675
+
643
676
  Ok(hash)
644
677
  }
data/lib/kreuzberg/config.rb CHANGED
@@ -391,14 +391,18 @@ module Kreuzberg
391
391
  # pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
392
392
  #
393
393
  class PDF
394
- attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy
394
+ attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
395
+ :extract_annotations, :top_margin_fraction, :bottom_margin_fraction
395
396
 
396
397
  def initialize(
397
398
  extract_images: false,
398
399
  passwords: nil,
399
400
  extract_metadata: true,
400
401
  font_config: nil,
401
- hierarchy: nil
402
+ hierarchy: nil,
403
+ extract_annotations: false,
404
+ top_margin_fraction: nil,
405
+ bottom_margin_fraction: nil
402
406
  )
403
407
  @extract_images = extract_images ? true : false
404
408
  @passwords = if passwords.is_a?(Array)
@@ -409,6 +413,9 @@ module Kreuzberg
409
413
  @extract_metadata = extract_metadata ? true : false
410
414
  @font_config = normalize_font_config(font_config)
411
415
  @hierarchy = normalize_hierarchy(hierarchy)
416
+ @extract_annotations = extract_annotations ? true : false
417
+ @top_margin_fraction = top_margin_fraction&.to_f
418
+ @bottom_margin_fraction = bottom_margin_fraction&.to_f
412
419
  end
413
420
 
414
421
  def to_h
@@ -417,7 +424,10 @@ module Kreuzberg
417
424
  passwords: @passwords,
418
425
  extract_metadata: @extract_metadata,
419
426
  font_config: @font_config&.to_h,
420
- hierarchy: @hierarchy&.to_h
427
+ hierarchy: @hierarchy&.to_h,
428
+ extract_annotations: @extract_annotations,
429
+ top_margin_fraction: @top_margin_fraction,
430
+ bottom_margin_fraction: @bottom_margin_fraction
421
431
  }.compact
422
432
  end
423
433
 
data/lib/kreuzberg/result.rb CHANGED
@@ -14,7 +14,7 @@ module Kreuzberg
14
14
  class Result
15
15
  attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
16
16
  :detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
17
- :document, :extracted_keywords, :quality_score, :processing_warnings
17
+ :document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
18
18
 
19
19
  # @!attribute [r] cells
20
20
  # @return [Array<Array<String>>] Table cells (2D array)
@@ -339,6 +339,7 @@ module Kreuzberg
339
339
  @extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
340
340
  @quality_score = get_value(hash, 'quality_score')
341
341
  @processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
342
+ @annotations = parse_annotations(get_value(hash, 'annotations'))
342
343
  end
343
344
  # rubocop:enable Metrics/AbcSize
344
345
 
@@ -346,6 +347,7 @@ module Kreuzberg
346
347
  #
347
348
  # @return [Hash] Hash representation
348
349
  #
350
+ # rubocop:disable Metrics/CyclomaticComplexity
349
351
  def to_h
350
352
  {
351
353
  content: @content,
@@ -362,9 +364,11 @@ module Kreuzberg
362
364
  document: @document&.to_h,
363
365
  extracted_keywords: @extracted_keywords&.map(&:to_h),
364
366
  quality_score: @quality_score,
365
- processing_warnings: @processing_warnings.map(&:to_h)
367
+ processing_warnings: @processing_warnings.map(&:to_h),
368
+ annotations: @annotations&.map(&:to_h)
366
369
  }
367
370
  end
371
+ # rubocop:enable Metrics/CyclomaticComplexity
368
372
 
369
373
  # Convert to JSON
370
374
  #
@@ -707,6 +711,32 @@ module Kreuzberg
707
711
  )
708
712
  end
709
713
  end
714
+
715
+ def parse_annotations(annotations_data)
716
+ return nil if annotations_data.nil?
717
+
718
+ annotations_data.map { |a_hash| build_annotation(a_hash) }
719
+ end
720
+
721
+ def build_annotation(a_hash)
722
+ PdfAnnotation.new(
723
+ annotation_type: a_hash['annotation_type'] || '',
724
+ content: a_hash['content'],
725
+ page_number: a_hash['page_number']&.to_i,
726
+ bounding_box: build_annotation_bbox(a_hash['bounding_box'])
727
+ )
728
+ end
729
+
730
+ def build_annotation_bbox(bbox_data)
731
+ return nil if bbox_data.nil?
732
+
733
+ PdfAnnotationBoundingBox.new(
734
+ left: bbox_data['left']&.to_f,
735
+ top: bbox_data['top']&.to_f,
736
+ right: bbox_data['right']&.to_f,
737
+ bottom: bbox_data['bottom']&.to_f
738
+ )
739
+ end
710
740
  end
711
741
  # rubocop:enable Metrics/ClassLength
712
742
  end
data/lib/kreuzberg/types.rb CHANGED
@@ -411,4 +411,24 @@ module Kreuzberg
411
411
 
412
412
  const :nodes, T::Array[DocumentNode]
413
413
  end
414
+
415
+ # Bounding box for a PDF annotation.
416
+ class PdfAnnotationBoundingBox < T::Struct
417
+ extend T::Sig
418
+
419
+ const :left, T.nilable(Float)
420
+ const :top, T.nilable(Float)
421
+ const :right, T.nilable(Float)
422
+ const :bottom, T.nilable(Float)
423
+ end
424
+
425
+ # A PDF annotation extracted from a document page.
426
+ class PdfAnnotation < T::Struct
427
+ extend T::Sig
428
+
429
+ const :annotation_type, String
430
+ const :content, T.nilable(String)
431
+ const :page_number, T.nilable(Integer)
432
+ const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
433
+ end
414
434
  end
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.3.6'
4
+ VERSION = '4.3.7'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -323,8 +323,11 @@ module Kreuzberg
323
323
  attr_reader extract_metadata: bool
324
324
  attr_reader font_config: FontConfig?
325
325
  attr_reader hierarchy: Hierarchy?
326
+ attr_reader extract_annotations: bool
327
+ attr_reader top_margin_fraction: Float?
328
+ attr_reader bottom_margin_fraction: Float?
326
329
 
327
- def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
330
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
328
331
  def to_h: () -> Hash[Symbol, untyped]
329
332
  end
330
333
 
@@ -525,6 +528,15 @@ module Kreuzberg
525
528
  end
526
529
 
527
530
  # Extraction result type
531
+ type pdf_annotation_type = 'text' | 'highlight' | 'link' | 'stamp' | 'underline' | 'strike_out' | 'other'
532
+
533
+ type pdf_annotation_hash = {
534
+ annotation_type: String,
535
+ content: String?,
536
+ page_number: Integer,
537
+ bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
538
+ }
539
+
528
540
  type extraction_result_hash = {
529
541
  content: String,
530
542
  mime_type: String,
@@ -541,7 +553,8 @@ module Kreuzberg
541
553
  document: document_structure_hash?,
542
554
  extracted_keywords: Array[extracted_keyword_hash]?,
543
555
  quality_score: Float?,
544
- processing_warnings: Array[processing_warning_hash]?
556
+ processing_warnings: Array[processing_warning_hash]?,
557
+ annotations: Array[pdf_annotation_hash]?
545
558
  }
546
559
 
547
560
  type extracted_keyword_hash = {
@@ -1076,6 +1089,18 @@ module Kreuzberg
1076
1089
  attr_reader extracted_keywords: Array[ExtractedKeyword]?
1077
1090
  attr_reader quality_score: Float?
1078
1091
  attr_reader processing_warnings: Array[ProcessingWarning]?
1092
+ attr_reader annotations: Array[PdfAnnotation]?
1093
+
1094
+ # PDF annotation extracted from a document page (Struct from result.rb)
1095
+ class PdfAnnotation
1096
+ attr_reader annotation_type: String
1097
+ attr_reader content: String?
1098
+ attr_reader page_number: Integer
1099
+ attr_reader bounding_box: BoundingBox?
1100
+
1101
+ def initialize: (annotation_type: String, content: String?, page_number: Integer, bounding_box: BoundingBox?) -> void
1102
+ def to_h: () -> pdf_annotation_hash
1103
+ end
1079
1104
 
1080
1105
  def initialize: (extraction_result_hash hash) -> void
1081
1106
  def to_h: () -> Hash[Symbol, untyped]
@@ -1113,6 +1138,7 @@ module Kreuzberg
1113
1138
  def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
1114
1139
  def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
1115
1140
  def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
1141
+ def parse_annotations: (Array[pdf_annotation_hash]? annotations_data) -> Array[PdfAnnotation]?
1116
1142
  end
1117
1143
 
1118
1144
  # Module methods (extraction API)
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.3.6"
5
+ version = "4.3.7"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -37,7 +37,7 @@ serde_json = { version = "1.0.149" }
37
37
  tempfile = "3.25.0"
38
38
  thiserror = "2.0.18"
39
39
  tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
40
- toml = "1.0.2"
40
+ toml = "1.0.3"
41
41
  tracing = "0.1"
42
42
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
43
43
  wasm-bindgen-futures = "0.4"