kreuzberg 4.3.6 → 4.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +65 -28
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +29 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
- data/ext/kreuzberg_rb/native/src/result.rs +33 -0
- data/lib/kreuzberg/config.rb +13 -3
- data/lib/kreuzberg/result.rb +32 -2
- data/lib/kreuzberg/types.rb +20 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +28 -2
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +8 -8
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +5 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +32 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/extractor/batch.rs +2 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +1 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +94 -0
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +1 -0
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +4 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/citation.rs +2 -0
- data/vendor/kreuzberg/src/extractors/csv.rs +1 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +1 -0
- data/vendor/kreuzberg/src/extractors/doc.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +1 -0
- data/vendor/kreuzberg/src/extractors/email.rs +1 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +21 -8
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +1 -0
- data/vendor/kreuzberg/src/extractors/image.rs +2 -0
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +1 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +297 -4
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +7 -0
- data/vendor/kreuzberg/src/extractors/ppt.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +1 -0
- data/vendor/kreuzberg/src/extractors/text.rs +2 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +7 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +5 -0
- data/vendor/kreuzberg/src/mcp/format.rs +4 -0
- data/vendor/kreuzberg/src/mcp/params.rs +20 -0
- data/vendor/kreuzberg/src/mcp/server.rs +8 -2
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +8 -8
- data/vendor/kreuzberg/src/ocr/table/mod.rs +26 -5
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +1 -0
- data/vendor/kreuzberg/src/pdf/annotations.rs +177 -0
- data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +5 -14
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +420 -51
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +1 -9
- data/vendor/kreuzberg/src/pdf/markdown/columns.rs +182 -0
- data/vendor/kreuzberg/src/pdf/markdown/constants.rs +2 -2
- data/vendor/kreuzberg/src/pdf/markdown/lines.rs +10 -25
- data/vendor/kreuzberg/src/pdf/markdown/mod.rs +2 -1
- data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +2 -21
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +56 -20
- data/vendor/kreuzberg/src/pdf/markdown/render.rs +0 -27
- data/vendor/kreuzberg/src/pdf/markdown/types.rs +0 -7
- data/vendor/kreuzberg/src/pdf/mod.rs +4 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +2 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +1 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +2 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +5 -0
- data/vendor/kreuzberg/src/types/annotations.rs +41 -0
- data/vendor/kreuzberg/src/types/extraction.rs +9 -0
- data/vendor/kreuzberg/src/types/mod.rs +2 -0
- data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +33 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +12 -0
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +3 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +40 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +17 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +7 -4
- data/vendor/kreuzberg-ffi/src/memory.rs +9 -1
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -0
- data/vendor/kreuzberg-ffi/src/result.rs +1 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-ffi/src/validation.rs +1 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/tests/diagnostic.rs +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/paragraph.rs +78 -142
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1c9f9cd70dd541fd9c193c1ab60782eb04e287019f707eeb0d6eb853f64fc039
|
|
4
|
+
data.tar.gz: ce8345437d8e47062a21799b0352ceb2d5917a0546421b39a928fe1b659dc28d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 012fac8575af2561d8b114649ef370ee281e90629a3ac95abad398dc3371853109e76e19b4aca9aeeba315853cae72f2ff9788681bd7b1bcfa3cb8245dbaa939
|
|
7
|
+
data.tar.gz: fce6aa4f32a4651821d20ba61c77d6b61fc1038492e4b7d960c585ce2c131dd228430835fcb35f945998c64ae3faa71655ab5748b6243e2c70453e9505a442a0
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.3.6)
|
|
4
|
+
kreuzberg (4.3.7)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -124,7 +124,7 @@ GEM
|
|
|
124
124
|
rubocop (~> 1.81)
|
|
125
125
|
ruby-progressbar (1.13.0)
|
|
126
126
|
securerandom (0.4.1)
|
|
127
|
-
sorbet-runtime (0.6.
|
|
127
|
+
sorbet-runtime (0.6.12956)
|
|
128
128
|
steep (1.10.0)
|
|
129
129
|
activesupport (>= 5.1)
|
|
130
130
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -210,7 +210,7 @@ CHECKSUMS
|
|
|
210
210
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
211
211
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
212
212
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
213
|
-
kreuzberg (4.3.6)
|
|
213
|
+
kreuzberg (4.3.7)
|
|
214
214
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
215
215
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
216
216
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -245,7 +245,7 @@ CHECKSUMS
|
|
|
245
245
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
246
246
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
247
247
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
248
|
-
sorbet-runtime (0.6.
|
|
248
|
+
sorbet-runtime (0.6.12956) sha256=fee716a62d0b1d94ebc8e6ba23e76a7654eeac66c1f5cc1e1bef78b8e9ff87c7
|
|
249
249
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
250
250
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
251
251
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.6" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -16,7 +16,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
|
|
16
16
|
dependencies = [
|
|
17
17
|
"cfg-if",
|
|
18
18
|
"cipher",
|
|
19
|
-
"cpufeatures",
|
|
19
|
+
"cpufeatures 0.2.17",
|
|
20
20
|
]
|
|
21
21
|
|
|
22
22
|
[[package]]
|
|
@@ -795,6 +795,17 @@ version = "0.2.1"
|
|
|
795
795
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
796
796
|
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
|
|
797
797
|
|
|
798
|
+
[[package]]
|
|
799
|
+
name = "chacha20"
|
|
800
|
+
version = "0.10.0"
|
|
801
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
802
|
+
checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
|
|
803
|
+
dependencies = [
|
|
804
|
+
"cfg-if",
|
|
805
|
+
"cpufeatures 0.3.0",
|
|
806
|
+
"rand_core 0.10.0",
|
|
807
|
+
]
|
|
808
|
+
|
|
798
809
|
[[package]]
|
|
799
810
|
name = "chardetng"
|
|
800
811
|
version = "0.1.17"
|
|
@@ -1099,6 +1110,15 @@ dependencies = [
|
|
|
1099
1110
|
"libc",
|
|
1100
1111
|
]
|
|
1101
1112
|
|
|
1113
|
+
[[package]]
|
|
1114
|
+
name = "cpufeatures"
|
|
1115
|
+
version = "0.3.0"
|
|
1116
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1117
|
+
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
|
1118
|
+
dependencies = [
|
|
1119
|
+
"libc",
|
|
1120
|
+
]
|
|
1121
|
+
|
|
1102
1122
|
[[package]]
|
|
1103
1123
|
name = "crc"
|
|
1104
1124
|
version = "3.3.0"
|
|
@@ -1632,9 +1652,9 @@ dependencies = [
|
|
|
1632
1652
|
|
|
1633
1653
|
[[package]]
|
|
1634
1654
|
name = "fastembed"
|
|
1635
|
-
version = "5.
|
|
1655
|
+
version = "5.11.0"
|
|
1636
1656
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1637
|
-
checksum = "
|
|
1657
|
+
checksum = "b4339d45a80579ab8305616a501eacdbf18fb0f7def7fa6e4c0b75941416d5b0"
|
|
1638
1658
|
dependencies = [
|
|
1639
1659
|
"anyhow",
|
|
1640
1660
|
"hf-hub",
|
|
@@ -1961,6 +1981,7 @@ dependencies = [
|
|
|
1961
1981
|
"js-sys",
|
|
1962
1982
|
"libc",
|
|
1963
1983
|
"r-efi",
|
|
1984
|
+
"rand_core 0.10.0",
|
|
1964
1985
|
"wasip2",
|
|
1965
1986
|
"wasip3",
|
|
1966
1987
|
"wasm-bindgen",
|
|
@@ -2258,9 +2279,9 @@ dependencies = [
|
|
|
2258
2279
|
|
|
2259
2280
|
[[package]]
|
|
2260
2281
|
name = "html-to-markdown-rs"
|
|
2261
|
-
version = "2.25.
|
|
2282
|
+
version = "2.25.1"
|
|
2262
2283
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2263
|
-
checksum = "
|
|
2284
|
+
checksum = "c05335c6bf406653110ad8447c84461c6d0cda5e0aff9d3d3518f87502d30abe"
|
|
2264
2285
|
dependencies = [
|
|
2265
2286
|
"ahash",
|
|
2266
2287
|
"astral-tl",
|
|
@@ -2880,7 +2901,7 @@ dependencies = [
|
|
|
2880
2901
|
|
|
2881
2902
|
[[package]]
|
|
2882
2903
|
name = "kreuzberg"
|
|
2883
|
-
version = "4.3.
|
|
2904
|
+
version = "4.3.6"
|
|
2884
2905
|
dependencies = [
|
|
2885
2906
|
"ahash",
|
|
2886
2907
|
"async-trait",
|
|
@@ -2955,7 +2976,7 @@ dependencies = [
|
|
|
2955
2976
|
"thiserror 2.0.18",
|
|
2956
2977
|
"tiff 0.11.0",
|
|
2957
2978
|
"tokio",
|
|
2958
|
-
"toml 1.0.
|
|
2979
|
+
"toml 1.0.3+spec-1.1.0",
|
|
2959
2980
|
"tower",
|
|
2960
2981
|
"tower-http",
|
|
2961
2982
|
"tracing",
|
|
@@ -2967,12 +2988,12 @@ dependencies = [
|
|
|
2967
2988
|
"uuid",
|
|
2968
2989
|
"whatlang",
|
|
2969
2990
|
"yake-rust",
|
|
2970
|
-
"zip 8.
|
|
2991
|
+
"zip 8.1.0",
|
|
2971
2992
|
]
|
|
2972
2993
|
|
|
2973
2994
|
[[package]]
|
|
2974
2995
|
name = "kreuzberg-ffi"
|
|
2975
|
-
version = "4.3.
|
|
2996
|
+
version = "4.3.6"
|
|
2976
2997
|
dependencies = [
|
|
2977
2998
|
"ahash",
|
|
2978
2999
|
"async-trait",
|
|
@@ -2988,7 +3009,7 @@ dependencies = [
|
|
|
2988
3009
|
|
|
2989
3010
|
[[package]]
|
|
2990
3011
|
name = "kreuzberg-paddle-ocr"
|
|
2991
|
-
version = "4.3.
|
|
3012
|
+
version = "4.3.6"
|
|
2992
3013
|
dependencies = [
|
|
2993
3014
|
"geo-clipper",
|
|
2994
3015
|
"geo-types",
|
|
@@ -3003,7 +3024,7 @@ dependencies = [
|
|
|
3003
3024
|
|
|
3004
3025
|
[[package]]
|
|
3005
3026
|
name = "kreuzberg-pdfium-render"
|
|
3006
|
-
version = "4.3.
|
|
3027
|
+
version = "4.3.6"
|
|
3007
3028
|
dependencies = [
|
|
3008
3029
|
"bitflags",
|
|
3009
3030
|
"bytemuck",
|
|
@@ -3027,7 +3048,7 @@ dependencies = [
|
|
|
3027
3048
|
|
|
3028
3049
|
[[package]]
|
|
3029
3050
|
name = "kreuzberg-rb"
|
|
3030
|
-
version = "4.3.
|
|
3051
|
+
version = "4.3.6"
|
|
3031
3052
|
dependencies = [
|
|
3032
3053
|
"async-trait",
|
|
3033
3054
|
"html-to-markdown-rs",
|
|
@@ -3044,14 +3065,14 @@ dependencies = [
|
|
|
3044
3065
|
|
|
3045
3066
|
[[package]]
|
|
3046
3067
|
name = "kreuzberg-tesseract"
|
|
3047
|
-
version = "4.3.
|
|
3068
|
+
version = "4.3.6"
|
|
3048
3069
|
dependencies = [
|
|
3049
3070
|
"cc",
|
|
3050
3071
|
"cmake",
|
|
3051
3072
|
"libc",
|
|
3052
3073
|
"reqwest 0.13.2",
|
|
3053
3074
|
"thiserror 2.0.18",
|
|
3054
|
-
"zip 8.
|
|
3075
|
+
"zip 8.1.0",
|
|
3055
3076
|
]
|
|
3056
3077
|
|
|
3057
3078
|
[[package]]
|
|
@@ -4985,6 +5006,17 @@ dependencies = [
|
|
|
4985
5006
|
"rand_core 0.9.3",
|
|
4986
5007
|
]
|
|
4987
5008
|
|
|
5009
|
+
[[package]]
|
|
5010
|
+
name = "rand"
|
|
5011
|
+
version = "0.10.0"
|
|
5012
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5013
|
+
checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
|
|
5014
|
+
dependencies = [
|
|
5015
|
+
"chacha20",
|
|
5016
|
+
"getrandom 0.4.1",
|
|
5017
|
+
"rand_core 0.10.0",
|
|
5018
|
+
]
|
|
5019
|
+
|
|
4988
5020
|
[[package]]
|
|
4989
5021
|
name = "rand_chacha"
|
|
4990
5022
|
version = "0.9.0"
|
|
@@ -5010,6 +5042,12 @@ dependencies = [
|
|
|
5010
5042
|
"getrandom 0.3.4",
|
|
5011
5043
|
]
|
|
5012
5044
|
|
|
5045
|
+
[[package]]
|
|
5046
|
+
name = "rand_core"
|
|
5047
|
+
version = "0.10.0"
|
|
5048
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5049
|
+
checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
|
|
5050
|
+
|
|
5013
5051
|
[[package]]
|
|
5014
5052
|
name = "rand_distr"
|
|
5015
5053
|
version = "0.5.1"
|
|
@@ -5362,12 +5400,11 @@ dependencies = [
|
|
|
5362
5400
|
|
|
5363
5401
|
[[package]]
|
|
5364
5402
|
name = "rmcp"
|
|
5365
|
-
version = "0.
|
|
5403
|
+
version = "0.16.0"
|
|
5366
5404
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5367
|
-
checksum = "
|
|
5405
|
+
checksum = "cc4c9c94680f75470ee8083a0667988b5d7b5beb70b9f998a8e51de7c682ce60"
|
|
5368
5406
|
dependencies = [
|
|
5369
5407
|
"async-trait",
|
|
5370
|
-
"axum",
|
|
5371
5408
|
"base64 0.22.1",
|
|
5372
5409
|
"bytes",
|
|
5373
5410
|
"chrono",
|
|
@@ -5377,7 +5414,7 @@ dependencies = [
|
|
|
5377
5414
|
"http-body-util",
|
|
5378
5415
|
"pastey 0.2.1",
|
|
5379
5416
|
"pin-project-lite",
|
|
5380
|
-
"rand 0.
|
|
5417
|
+
"rand 0.10.0",
|
|
5381
5418
|
"rmcp-macros",
|
|
5382
5419
|
"schemars",
|
|
5383
5420
|
"serde",
|
|
@@ -5394,9 +5431,9 @@ dependencies = [
|
|
|
5394
5431
|
|
|
5395
5432
|
[[package]]
|
|
5396
5433
|
name = "rmcp-macros"
|
|
5397
|
-
version = "0.
|
|
5434
|
+
version = "0.16.0"
|
|
5398
5435
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5399
|
-
checksum = "
|
|
5436
|
+
checksum = "90c23c8f26cae4da838fbc3eadfaecf2d549d97c04b558e7bd90526a9c28b42a"
|
|
5400
5437
|
dependencies = [
|
|
5401
5438
|
"darling 0.23.0",
|
|
5402
5439
|
"proc-macro2",
|
|
@@ -5861,7 +5898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
5861
5898
|
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
|
|
5862
5899
|
dependencies = [
|
|
5863
5900
|
"cfg-if",
|
|
5864
|
-
"cpufeatures",
|
|
5901
|
+
"cpufeatures 0.2.17",
|
|
5865
5902
|
"digest",
|
|
5866
5903
|
]
|
|
5867
5904
|
|
|
@@ -5872,7 +5909,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
5872
5909
|
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
|
5873
5910
|
dependencies = [
|
|
5874
5911
|
"cfg-if",
|
|
5875
|
-
"cpufeatures",
|
|
5912
|
+
"cpufeatures 0.2.17",
|
|
5876
5913
|
"digest",
|
|
5877
5914
|
]
|
|
5878
5915
|
|
|
@@ -6584,9 +6621,9 @@ dependencies = [
|
|
|
6584
6621
|
|
|
6585
6622
|
[[package]]
|
|
6586
6623
|
name = "toml"
|
|
6587
|
-
version = "1.0.
|
|
6624
|
+
version = "1.0.3+spec-1.1.0"
|
|
6588
6625
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6589
|
-
checksum = "
|
|
6626
|
+
checksum = "c7614eaf19ad818347db24addfa201729cf2a9b6fdfd9eb0ab870fcacc606c0c"
|
|
6590
6627
|
dependencies = [
|
|
6591
6628
|
"indexmap",
|
|
6592
6629
|
"serde_core",
|
|
@@ -6640,9 +6677,9 @@ dependencies = [
|
|
|
6640
6677
|
|
|
6641
6678
|
[[package]]
|
|
6642
6679
|
name = "toml_parser"
|
|
6643
|
-
version = "1.0.
|
|
6680
|
+
version = "1.0.9+spec-1.1.0"
|
|
6644
6681
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6645
|
-
checksum = "
|
|
6682
|
+
checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
|
|
6646
6683
|
dependencies = [
|
|
6647
6684
|
"winnow",
|
|
6648
6685
|
]
|
|
@@ -8072,9 +8109,9 @@ dependencies = [
|
|
|
8072
8109
|
|
|
8073
8110
|
[[package]]
|
|
8074
8111
|
name = "zip"
|
|
8075
|
-
version = "8.
|
|
8112
|
+
version = "8.1.0"
|
|
8076
8113
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
8077
|
-
checksum = "
|
|
8114
|
+
checksum = "6e499faf5c6b97a0d086f4a8733de6d47aee2252b8127962439d8d4311a73f72"
|
|
8078
8115
|
dependencies = [
|
|
8079
8116
|
"aes",
|
|
8080
8117
|
"bzip2",
|
|
@@ -240,11 +240,40 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
|
240
240
|
None
|
|
241
241
|
};
|
|
242
242
|
|
|
243
|
+
let extract_annotations = if let Some(val) = get_kw(ruby, hash, "extract_annotations") {
|
|
244
|
+
bool::try_convert(val)?
|
|
245
|
+
} else {
|
|
246
|
+
false
|
|
247
|
+
};
|
|
248
|
+
|
|
249
|
+
let top_margin_fraction = if let Some(val) = get_kw(ruby, hash, "top_margin_fraction") {
|
|
250
|
+
if !val.is_nil() {
|
|
251
|
+
Some(f32::try_convert(val)?)
|
|
252
|
+
} else {
|
|
253
|
+
None
|
|
254
|
+
}
|
|
255
|
+
} else {
|
|
256
|
+
None
|
|
257
|
+
};
|
|
258
|
+
|
|
259
|
+
let bottom_margin_fraction = if let Some(val) = get_kw(ruby, hash, "bottom_margin_fraction") {
|
|
260
|
+
if !val.is_nil() {
|
|
261
|
+
Some(f32::try_convert(val)?)
|
|
262
|
+
} else {
|
|
263
|
+
None
|
|
264
|
+
}
|
|
265
|
+
} else {
|
|
266
|
+
None
|
|
267
|
+
};
|
|
268
|
+
|
|
243
269
|
let config = PdfConfig {
|
|
244
270
|
extract_images,
|
|
245
271
|
passwords,
|
|
246
272
|
extract_metadata,
|
|
247
273
|
hierarchy,
|
|
274
|
+
extract_annotations,
|
|
275
|
+
top_margin_fraction,
|
|
276
|
+
bottom_margin_fraction,
|
|
248
277
|
};
|
|
249
278
|
|
|
250
279
|
Ok(config)
|
|
@@ -640,5 +640,38 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
640
640
|
}
|
|
641
641
|
set_hash_entry(ruby, &hash, "processing_warnings", warnings_array.into_value_with(ruby))?;
|
|
642
642
|
|
|
643
|
+
// Convert annotations
|
|
644
|
+
if let Some(annotations) = result.annotations {
|
|
645
|
+
let annotations_array = ruby.ary_new();
|
|
646
|
+
for annot in annotations {
|
|
647
|
+
let annot_hash = ruby.hash_new();
|
|
648
|
+
let type_str = serde_json::to_value(&annot.annotation_type)
|
|
649
|
+
.ok()
|
|
650
|
+
.and_then(|v| v.as_str().map(String::from))
|
|
651
|
+
.unwrap_or_default();
|
|
652
|
+
annot_hash.aset("annotation_type", type_str.as_str())?;
|
|
653
|
+
if let Some(content) = annot.content {
|
|
654
|
+
annot_hash.aset("content", content.as_str())?;
|
|
655
|
+
} else {
|
|
656
|
+
annot_hash.aset("content", ruby.qnil().as_value())?;
|
|
657
|
+
}
|
|
658
|
+
annot_hash.aset("page_number", annot.page_number as i64)?;
|
|
659
|
+
if let Some(bbox) = annot.bounding_box {
|
|
660
|
+
let bbox_hash = ruby.hash_new();
|
|
661
|
+
bbox_hash.aset("x0", bbox.x0)?;
|
|
662
|
+
bbox_hash.aset("y0", bbox.y0)?;
|
|
663
|
+
bbox_hash.aset("x1", bbox.x1)?;
|
|
664
|
+
bbox_hash.aset("y1", bbox.y1)?;
|
|
665
|
+
annot_hash.aset("bounding_box", bbox_hash)?;
|
|
666
|
+
} else {
|
|
667
|
+
annot_hash.aset("bounding_box", ruby.qnil().as_value())?;
|
|
668
|
+
}
|
|
669
|
+
annotations_array.push(annot_hash)?;
|
|
670
|
+
}
|
|
671
|
+
set_hash_entry(ruby, &hash, "annotations", annotations_array.into_value_with(ruby))?;
|
|
672
|
+
} else {
|
|
673
|
+
set_hash_entry(ruby, &hash, "annotations", ruby.qnil().as_value())?;
|
|
674
|
+
}
|
|
675
|
+
|
|
643
676
|
Ok(hash)
|
|
644
677
|
}
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -391,14 +391,18 @@ module Kreuzberg
|
|
|
391
391
|
# pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
|
|
392
392
|
#
|
|
393
393
|
class PDF
|
|
394
|
-
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy
|
|
394
|
+
attr_reader :extract_images, :passwords, :extract_metadata, :font_config, :hierarchy,
|
|
395
|
+
:extract_annotations, :top_margin_fraction, :bottom_margin_fraction
|
|
395
396
|
|
|
396
397
|
def initialize(
|
|
397
398
|
extract_images: false,
|
|
398
399
|
passwords: nil,
|
|
399
400
|
extract_metadata: true,
|
|
400
401
|
font_config: nil,
|
|
401
|
-
hierarchy: nil
|
|
402
|
+
hierarchy: nil,
|
|
403
|
+
extract_annotations: false,
|
|
404
|
+
top_margin_fraction: nil,
|
|
405
|
+
bottom_margin_fraction: nil
|
|
402
406
|
)
|
|
403
407
|
@extract_images = extract_images ? true : false
|
|
404
408
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -409,6 +413,9 @@ module Kreuzberg
|
|
|
409
413
|
@extract_metadata = extract_metadata ? true : false
|
|
410
414
|
@font_config = normalize_font_config(font_config)
|
|
411
415
|
@hierarchy = normalize_hierarchy(hierarchy)
|
|
416
|
+
@extract_annotations = extract_annotations ? true : false
|
|
417
|
+
@top_margin_fraction = top_margin_fraction&.to_f
|
|
418
|
+
@bottom_margin_fraction = bottom_margin_fraction&.to_f
|
|
412
419
|
end
|
|
413
420
|
|
|
414
421
|
def to_h
|
|
@@ -417,7 +424,10 @@ module Kreuzberg
|
|
|
417
424
|
passwords: @passwords,
|
|
418
425
|
extract_metadata: @extract_metadata,
|
|
419
426
|
font_config: @font_config&.to_h,
|
|
420
|
-
hierarchy: @hierarchy&.to_h
|
|
427
|
+
hierarchy: @hierarchy&.to_h,
|
|
428
|
+
extract_annotations: @extract_annotations,
|
|
429
|
+
top_margin_fraction: @top_margin_fraction,
|
|
430
|
+
bottom_margin_fraction: @bottom_margin_fraction
|
|
421
431
|
}.compact
|
|
422
432
|
end
|
|
423
433
|
|
data/lib/kreuzberg/result.rb
CHANGED
|
@@ -14,7 +14,7 @@ module Kreuzberg
|
|
|
14
14
|
class Result
|
|
15
15
|
attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
|
|
16
16
|
:detected_languages, :chunks, :images, :pages, :elements, :ocr_elements, :djot_content,
|
|
17
|
-
:document, :extracted_keywords, :quality_score, :processing_warnings
|
|
17
|
+
:document, :extracted_keywords, :quality_score, :processing_warnings, :annotations
|
|
18
18
|
|
|
19
19
|
# @!attribute [r] cells
|
|
20
20
|
# @return [Array<Array<String>>] Table cells (2D array)
|
|
@@ -339,6 +339,7 @@ module Kreuzberg
|
|
|
339
339
|
@extracted_keywords = parse_extracted_keywords(get_value(hash, 'extracted_keywords'))
|
|
340
340
|
@quality_score = get_value(hash, 'quality_score')
|
|
341
341
|
@processing_warnings = parse_processing_warnings(get_value(hash, 'processing_warnings'))
|
|
342
|
+
@annotations = parse_annotations(get_value(hash, 'annotations'))
|
|
342
343
|
end
|
|
343
344
|
# rubocop:enable Metrics/AbcSize
|
|
344
345
|
|
|
@@ -346,6 +347,7 @@ module Kreuzberg
|
|
|
346
347
|
#
|
|
347
348
|
# @return [Hash] Hash representation
|
|
348
349
|
#
|
|
350
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
349
351
|
def to_h
|
|
350
352
|
{
|
|
351
353
|
content: @content,
|
|
@@ -362,9 +364,11 @@ module Kreuzberg
|
|
|
362
364
|
document: @document&.to_h,
|
|
363
365
|
extracted_keywords: @extracted_keywords&.map(&:to_h),
|
|
364
366
|
quality_score: @quality_score,
|
|
365
|
-
processing_warnings: @processing_warnings.map(&:to_h)
|
|
367
|
+
processing_warnings: @processing_warnings.map(&:to_h),
|
|
368
|
+
annotations: @annotations&.map(&:to_h)
|
|
366
369
|
}
|
|
367
370
|
end
|
|
371
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
368
372
|
|
|
369
373
|
# Convert to JSON
|
|
370
374
|
#
|
|
@@ -707,6 +711,32 @@ module Kreuzberg
|
|
|
707
711
|
)
|
|
708
712
|
end
|
|
709
713
|
end
|
|
714
|
+
|
|
715
|
+
def parse_annotations(annotations_data)
|
|
716
|
+
return nil if annotations_data.nil?
|
|
717
|
+
|
|
718
|
+
annotations_data.map { |a_hash| build_annotation(a_hash) }
|
|
719
|
+
end
|
|
720
|
+
|
|
721
|
+
def build_annotation(a_hash)
|
|
722
|
+
PdfAnnotation.new(
|
|
723
|
+
annotation_type: a_hash['annotation_type'] || '',
|
|
724
|
+
content: a_hash['content'],
|
|
725
|
+
page_number: a_hash['page_number']&.to_i,
|
|
726
|
+
bounding_box: build_annotation_bbox(a_hash['bounding_box'])
|
|
727
|
+
)
|
|
728
|
+
end
|
|
729
|
+
|
|
730
|
+
def build_annotation_bbox(bbox_data)
|
|
731
|
+
return nil if bbox_data.nil?
|
|
732
|
+
|
|
733
|
+
PdfAnnotationBoundingBox.new(
|
|
734
|
+
left: bbox_data['left']&.to_f,
|
|
735
|
+
top: bbox_data['top']&.to_f,
|
|
736
|
+
right: bbox_data['right']&.to_f,
|
|
737
|
+
bottom: bbox_data['bottom']&.to_f
|
|
738
|
+
)
|
|
739
|
+
end
|
|
710
740
|
end
|
|
711
741
|
# rubocop:enable Metrics/ClassLength
|
|
712
742
|
end
|
data/lib/kreuzberg/types.rb
CHANGED
|
@@ -411,4 +411,24 @@ module Kreuzberg
|
|
|
411
411
|
|
|
412
412
|
const :nodes, T::Array[DocumentNode]
|
|
413
413
|
end
|
|
414
|
+
|
|
415
|
+
# Bounding box for a PDF annotation.
|
|
416
|
+
class PdfAnnotationBoundingBox < T::Struct
|
|
417
|
+
extend T::Sig
|
|
418
|
+
|
|
419
|
+
const :left, T.nilable(Float)
|
|
420
|
+
const :top, T.nilable(Float)
|
|
421
|
+
const :right, T.nilable(Float)
|
|
422
|
+
const :bottom, T.nilable(Float)
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
# A PDF annotation extracted from a document page.
|
|
426
|
+
class PdfAnnotation < T::Struct
|
|
427
|
+
extend T::Sig
|
|
428
|
+
|
|
429
|
+
const :annotation_type, String
|
|
430
|
+
const :content, T.nilable(String)
|
|
431
|
+
const :page_number, T.nilable(Integer)
|
|
432
|
+
const :bounding_box, T.nilable(PdfAnnotationBoundingBox)
|
|
433
|
+
end
|
|
414
434
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -323,8 +323,11 @@ module Kreuzberg
|
|
|
323
323
|
attr_reader extract_metadata: bool
|
|
324
324
|
attr_reader font_config: FontConfig?
|
|
325
325
|
attr_reader hierarchy: Hierarchy?
|
|
326
|
+
attr_reader extract_annotations: bool
|
|
327
|
+
attr_reader top_margin_fraction: Float?
|
|
328
|
+
attr_reader bottom_margin_fraction: Float?
|
|
326
329
|
|
|
327
|
-
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?) -> void
|
|
330
|
+
def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?, ?hierarchy: (Hierarchy | Hash[Symbol, untyped])?, ?extract_annotations: bool, ?top_margin_fraction: Float?, ?bottom_margin_fraction: Float?) -> void
|
|
328
331
|
def to_h: () -> Hash[Symbol, untyped]
|
|
329
332
|
end
|
|
330
333
|
|
|
@@ -525,6 +528,15 @@ module Kreuzberg
|
|
|
525
528
|
end
|
|
526
529
|
|
|
527
530
|
# Extraction result type
|
|
531
|
+
type pdf_annotation_type = 'text' | 'highlight' | 'link' | 'stamp' | 'underline' | 'strike_out' | 'other'
|
|
532
|
+
|
|
533
|
+
type pdf_annotation_hash = {
|
|
534
|
+
annotation_type: String,
|
|
535
|
+
content: String?,
|
|
536
|
+
page_number: Integer,
|
|
537
|
+
bounding_box: { x0: Float, y0: Float, x1: Float, y1: Float }?
|
|
538
|
+
}
|
|
539
|
+
|
|
528
540
|
type extraction_result_hash = {
|
|
529
541
|
content: String,
|
|
530
542
|
mime_type: String,
|
|
@@ -541,7 +553,8 @@ module Kreuzberg
|
|
|
541
553
|
document: document_structure_hash?,
|
|
542
554
|
extracted_keywords: Array[extracted_keyword_hash]?,
|
|
543
555
|
quality_score: Float?,
|
|
544
|
-
processing_warnings: Array[processing_warning_hash]
|
|
556
|
+
processing_warnings: Array[processing_warning_hash]?,
|
|
557
|
+
annotations: Array[pdf_annotation_hash]?
|
|
545
558
|
}
|
|
546
559
|
|
|
547
560
|
type extracted_keyword_hash = {
|
|
@@ -1076,6 +1089,18 @@ module Kreuzberg
|
|
|
1076
1089
|
attr_reader extracted_keywords: Array[ExtractedKeyword]?
|
|
1077
1090
|
attr_reader quality_score: Float?
|
|
1078
1091
|
attr_reader processing_warnings: Array[ProcessingWarning]?
|
|
1092
|
+
attr_reader annotations: Array[PdfAnnotation]?
|
|
1093
|
+
|
|
1094
|
+
# PDF annotation extracted from a document page (Struct from result.rb)
|
|
1095
|
+
class PdfAnnotation
|
|
1096
|
+
attr_reader annotation_type: String
|
|
1097
|
+
attr_reader content: String?
|
|
1098
|
+
attr_reader page_number: Integer
|
|
1099
|
+
attr_reader bounding_box: BoundingBox?
|
|
1100
|
+
|
|
1101
|
+
def initialize: (annotation_type: String, content: String?, page_number: Integer, bounding_box: BoundingBox?) -> void
|
|
1102
|
+
def to_h: () -> pdf_annotation_hash
|
|
1103
|
+
end
|
|
1079
1104
|
|
|
1080
1105
|
def initialize: (extraction_result_hash hash) -> void
|
|
1081
1106
|
def to_h: () -> Hash[Symbol, untyped]
|
|
@@ -1113,6 +1138,7 @@ module Kreuzberg
|
|
|
1113
1138
|
def parse_ocr_geometry: (Hash[String, untyped]? data) -> OcrBoundingGeometry?
|
|
1114
1139
|
def parse_ocr_confidence: (Hash[String, untyped]? data) -> OcrConfidence?
|
|
1115
1140
|
def parse_ocr_rotation: (Hash[String, untyped]? data) -> OcrRotation?
|
|
1141
|
+
def parse_annotations: (Array[pdf_annotation_hash]? annotations_data) -> Array[PdfAnnotation]?
|
|
1116
1142
|
end
|
|
1117
1143
|
|
|
1118
1144
|
# Module methods (extraction API)
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.3.6"
|
|
5
|
+
version = "4.3.7"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -37,7 +37,7 @@ serde_json = { version = "1.0.149" }
|
|
|
37
37
|
tempfile = "3.25.0"
|
|
38
38
|
thiserror = "2.0.18"
|
|
39
39
|
tokio = { version = "1.49.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
40
|
-
toml = "1.0.
|
|
40
|
+
toml = "1.0.3"
|
|
41
41
|
tracing = "0.1"
|
|
42
42
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
43
43
|
wasm-bindgen-futures = "0.4"
|