kreuzberg 4.7.4 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +81 -12
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +46 -0
- data/ext/kreuzberg_rb/native/src/embedding.rs +87 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +7 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +5 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
- data/lib/kreuzberg/config.rb +43 -5
- data/lib/kreuzberg/errors.rb +4 -0
- data/lib/kreuzberg/extraction_api.rb +35 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +7 -0
- data/sig/kreuzberg.rbs +20 -0
- data/vendor/Cargo.toml +5 -3
- data/vendor/kreuzberg/Cargo.toml +9 -1
- data/vendor/kreuzberg/README.md +3 -3
- data/vendor/kreuzberg/src/api/error.rs +1 -0
- data/vendor/kreuzberg/src/api/handlers.rs +231 -39
- data/vendor/kreuzberg/src/api/mod.rs +2 -1
- data/vendor/kreuzberg/src/api/openapi.rs +2 -0
- data/vendor/kreuzberg/src/api/router.rs +3 -1
- data/vendor/kreuzberg/src/api/types.rs +13 -0
- data/vendor/kreuzberg/src/chunking/processor.rs +12 -189
- data/vendor/kreuzberg/src/core/config/content_filter.rs +73 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +48 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +149 -37
- data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
- data/vendor/kreuzberg/src/core/config/llm.rs +108 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +4 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +32 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +32 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +2 -2
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +131 -1
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +1 -20
- data/vendor/kreuzberg/src/core/pipeline/format.rs +0 -6
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +24 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +6 -10
- data/vendor/kreuzberg/src/embeddings/mod.rs +246 -72
- data/vendor/kreuzberg/src/error.rs +8 -0
- data/vendor/kreuzberg/src/extraction/derive.rs +1 -8
- data/vendor/kreuzberg/src/extraction/doc/mod.rs +23 -0
- data/vendor/kreuzberg/src/extraction/hwp/model.rs +20 -11
- data/vendor/kreuzberg/src/extraction/hwp/parser.rs +39 -4
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +2 -21
- data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
- data/vendor/kreuzberg/src/extraction/ppt/mod.rs +17 -3
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +14 -1
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +56 -1
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +3 -44
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +7 -136
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -41
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +6 -1
- data/vendor/kreuzberg/src/extractors/html.rs +45 -3
- data/vendor/kreuzberg/src/extractors/odt.rs +12 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +9 -0
- data/vendor/kreuzberg/src/extractors/ppt.rs +9 -5
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +15 -2
- data/vendor/kreuzberg/src/keywords/processor.rs +13 -160
- data/vendor/kreuzberg/src/language_detection/processor.rs +5 -106
- data/vendor/kreuzberg/src/lib.rs +13 -4
- data/vendor/kreuzberg/src/llm/client.rs +39 -0
- data/vendor/kreuzberg/src/llm/mod.rs +15 -0
- data/vendor/kreuzberg/src/llm/prompts.rs +52 -0
- data/vendor/kreuzberg/src/llm/structured.rs +190 -0
- data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +118 -0
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +180 -0
- data/vendor/kreuzberg/src/mcp/errors.rs +18 -0
- data/vendor/kreuzberg/src/mcp/format.rs +4 -82
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -1
- data/vendor/kreuzberg/src/mcp/params.rs +38 -0
- data/vendor/kreuzberg/src/mcp/server.rs +122 -53
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -35
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +1 -16
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +36 -12
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +1 -16
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -52
- data/vendor/kreuzberg/src/plugins/ocr.rs +4 -69
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -167
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +13 -44
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -230
- data/vendor/kreuzberg/src/rendering/markdown.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +8 -109
- data/vendor/kreuzberg/src/types/extraction.rs +9 -0
- data/vendor/kreuzberg/tests/llm_integration.rs +295 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -3
- data/vendor/kreuzberg-ffi/README.md +2 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +71 -7
- data/vendor/kreuzberg-ffi/src/config_builder.rs +90 -2
- data/vendor/kreuzberg-ffi/src/embedding.rs +94 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -16
- data/vendor/kreuzberg-ffi/src/helpers.rs +28 -75
- data/vendor/kreuzberg-ffi/src/lib.rs +9 -3
- data/vendor/kreuzberg-ffi/src/memory.rs +4 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +2 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -19
- data/vendor/kreuzberg-ffi/src/result.rs +1 -15
- data/vendor/kreuzberg-ffi/src/result_view.rs +2 -34
- data/vendor/kreuzberg-ffi/src/string_intern.rs +9 -0
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-ffi/tests/c/test_config_builder.c +5 -0
- data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +14 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c768f2adf392c1598da39e79b20dcc7f0c55774f2d3063c74c6ec72888742bac
|
|
4
|
+
data.tar.gz: 8a350998762668be79e7f4a3843812782f4b0812fdb41327d8d38f1f57c39073
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 18b2dc0c39199bead5f1f310a38b5f288cba0d8ebb62d31505f13933a0af3bf9179f8377e7a8bd605c39d46e900869a4492f1bd829f853d6f1765263f4affd4d
|
|
7
|
+
data.tar.gz: 769b9c36d5f6722f97f3c3aebb97f1045382a6dbca96405e298c44bc268795d97781edb89d140c1bc2f0e1fb33c5efecb79c2e500469dd7a25fe2105c49081ba
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
<!-- Project Info -->
|
|
44
44
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-
|
|
45
|
+
<img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
|
|
46
46
|
</a>
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
@@ -419,7 +419,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
|
|
|
419
419
|
|
|
420
420
|
## License
|
|
421
421
|
|
|
422
|
-
|
|
422
|
+
Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
|
|
423
423
|
|
|
424
424
|
## Support
|
|
425
425
|
|
|
@@ -1565,9 +1565,9 @@ dependencies = [
|
|
|
1565
1565
|
|
|
1566
1566
|
[[package]]
|
|
1567
1567
|
name = "fastrand"
|
|
1568
|
-
version = "2.4.
|
|
1568
|
+
version = "2.4.1"
|
|
1569
1569
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1570
|
-
checksum = "
|
|
1570
|
+
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
|
1571
1571
|
|
|
1572
1572
|
[[package]]
|
|
1573
1573
|
name = "fax"
|
|
@@ -2792,7 +2792,7 @@ dependencies = [
|
|
|
2792
2792
|
|
|
2793
2793
|
[[package]]
|
|
2794
2794
|
name = "kreuzberg"
|
|
2795
|
-
version = "4.7.
|
|
2795
|
+
version = "4.7.4"
|
|
2796
2796
|
dependencies = [
|
|
2797
2797
|
"ahash",
|
|
2798
2798
|
"async-trait",
|
|
@@ -2829,12 +2829,14 @@ dependencies = [
|
|
|
2829
2829
|
"kreuzberg-pdfium-render",
|
|
2830
2830
|
"kreuzberg-tesseract",
|
|
2831
2831
|
"libc",
|
|
2832
|
+
"liter-llm",
|
|
2832
2833
|
"log",
|
|
2833
2834
|
"lopdf",
|
|
2834
2835
|
"mail-parser",
|
|
2835
2836
|
"memchr",
|
|
2836
2837
|
"memmap2",
|
|
2837
2838
|
"mime_guess",
|
|
2839
|
+
"minijinja",
|
|
2838
2840
|
"ndarray",
|
|
2839
2841
|
"num_cpus",
|
|
2840
2842
|
"once_cell",
|
|
@@ -2880,12 +2882,12 @@ dependencies = [
|
|
|
2880
2882
|
"utoipa",
|
|
2881
2883
|
"whatlang",
|
|
2882
2884
|
"yake-rust",
|
|
2883
|
-
"zip 8.5.
|
|
2885
|
+
"zip 8.5.1",
|
|
2884
2886
|
]
|
|
2885
2887
|
|
|
2886
2888
|
[[package]]
|
|
2887
2889
|
name = "kreuzberg-ffi"
|
|
2888
|
-
version = "4.7.
|
|
2890
|
+
version = "4.7.4"
|
|
2889
2891
|
dependencies = [
|
|
2890
2892
|
"ahash",
|
|
2891
2893
|
"async-trait",
|
|
@@ -2901,7 +2903,7 @@ dependencies = [
|
|
|
2901
2903
|
|
|
2902
2904
|
[[package]]
|
|
2903
2905
|
name = "kreuzberg-paddle-ocr"
|
|
2904
|
-
version = "4.7.
|
|
2906
|
+
version = "4.7.4"
|
|
2905
2907
|
dependencies = [
|
|
2906
2908
|
"geo-clipper",
|
|
2907
2909
|
"geo-types",
|
|
@@ -2915,7 +2917,7 @@ dependencies = [
|
|
|
2915
2917
|
|
|
2916
2918
|
[[package]]
|
|
2917
2919
|
name = "kreuzberg-pdfium-render"
|
|
2918
|
-
version = "4.7.
|
|
2920
|
+
version = "4.7.4"
|
|
2919
2921
|
dependencies = [
|
|
2920
2922
|
"bitflags",
|
|
2921
2923
|
"bytemuck",
|
|
@@ -2938,7 +2940,7 @@ dependencies = [
|
|
|
2938
2940
|
|
|
2939
2941
|
[[package]]
|
|
2940
2942
|
name = "kreuzberg-rb"
|
|
2941
|
-
version = "4.7.
|
|
2943
|
+
version = "4.7.4"
|
|
2942
2944
|
dependencies = [
|
|
2943
2945
|
"async-trait",
|
|
2944
2946
|
"html-to-markdown-rs",
|
|
@@ -2955,13 +2957,13 @@ dependencies = [
|
|
|
2955
2957
|
|
|
2956
2958
|
[[package]]
|
|
2957
2959
|
name = "kreuzberg-tesseract"
|
|
2958
|
-
version = "4.7.
|
|
2960
|
+
version = "4.7.4"
|
|
2959
2961
|
dependencies = [
|
|
2960
2962
|
"cc",
|
|
2961
2963
|
"cmake",
|
|
2962
2964
|
"reqwest",
|
|
2963
2965
|
"thiserror 2.0.18",
|
|
2964
|
-
"zip 8.5.
|
|
2966
|
+
"zip 8.5.1",
|
|
2965
2967
|
]
|
|
2966
2968
|
|
|
2967
2969
|
[[package]]
|
|
@@ -3060,6 +3062,27 @@ version = "0.8.2"
|
|
|
3060
3062
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3061
3063
|
checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
|
3062
3064
|
|
|
3065
|
+
[[package]]
|
|
3066
|
+
name = "liter-llm"
|
|
3067
|
+
version = "1.2.0"
|
|
3068
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3069
|
+
checksum = "0405bbc5926db49a5f73a4f503d9cac19413416c95e2fd736b1bfa8ce9491660"
|
|
3070
|
+
dependencies = [
|
|
3071
|
+
"base64 0.22.1",
|
|
3072
|
+
"bytes",
|
|
3073
|
+
"futures-core",
|
|
3074
|
+
"memchr",
|
|
3075
|
+
"pin-project-lite",
|
|
3076
|
+
"reqwest",
|
|
3077
|
+
"secrecy",
|
|
3078
|
+
"serde",
|
|
3079
|
+
"serde_json",
|
|
3080
|
+
"thiserror 2.0.18",
|
|
3081
|
+
"tokio",
|
|
3082
|
+
"toml 1.1.2+spec-1.1.0",
|
|
3083
|
+
"tracing",
|
|
3084
|
+
]
|
|
3085
|
+
|
|
3063
3086
|
[[package]]
|
|
3064
3087
|
name = "litrs"
|
|
3065
3088
|
version = "1.0.0"
|
|
@@ -3269,6 +3292,12 @@ dependencies = [
|
|
|
3269
3292
|
"libc",
|
|
3270
3293
|
]
|
|
3271
3294
|
|
|
3295
|
+
[[package]]
|
|
3296
|
+
name = "memo-map"
|
|
3297
|
+
version = "0.3.3"
|
|
3298
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3299
|
+
checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
|
|
3300
|
+
|
|
3272
3301
|
[[package]]
|
|
3273
3302
|
name = "mime"
|
|
3274
3303
|
version = "0.3.17"
|
|
@@ -3285,6 +3314,16 @@ dependencies = [
|
|
|
3285
3314
|
"unicase",
|
|
3286
3315
|
]
|
|
3287
3316
|
|
|
3317
|
+
[[package]]
|
|
3318
|
+
name = "minijinja"
|
|
3319
|
+
version = "2.19.0"
|
|
3320
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3321
|
+
checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d"
|
|
3322
|
+
dependencies = [
|
|
3323
|
+
"memo-map",
|
|
3324
|
+
"serde",
|
|
3325
|
+
]
|
|
3326
|
+
|
|
3288
3327
|
[[package]]
|
|
3289
3328
|
name = "minimal-lexical"
|
|
3290
3329
|
version = "0.2.1"
|
|
@@ -4391,6 +4430,7 @@ dependencies = [
|
|
|
4391
4430
|
"hyper-util",
|
|
4392
4431
|
"js-sys",
|
|
4393
4432
|
"log",
|
|
4433
|
+
"mime_guess",
|
|
4394
4434
|
"native-tls",
|
|
4395
4435
|
"percent-encoding",
|
|
4396
4436
|
"pin-project-lite",
|
|
@@ -4398,16 +4438,21 @@ dependencies = [
|
|
|
4398
4438
|
"rustls",
|
|
4399
4439
|
"rustls-pki-types",
|
|
4400
4440
|
"rustls-platform-verifier",
|
|
4441
|
+
"serde",
|
|
4442
|
+
"serde_json",
|
|
4443
|
+
"serde_urlencoded",
|
|
4401
4444
|
"sync_wrapper",
|
|
4402
4445
|
"tokio",
|
|
4403
4446
|
"tokio-native-tls",
|
|
4404
4447
|
"tokio-rustls",
|
|
4448
|
+
"tokio-util",
|
|
4405
4449
|
"tower",
|
|
4406
4450
|
"tower-http",
|
|
4407
4451
|
"tower-service",
|
|
4408
4452
|
"url",
|
|
4409
4453
|
"wasm-bindgen",
|
|
4410
4454
|
"wasm-bindgen-futures",
|
|
4455
|
+
"wasm-streams",
|
|
4411
4456
|
"web-sys",
|
|
4412
4457
|
]
|
|
4413
4458
|
|
|
@@ -4676,6 +4721,16 @@ version = "1.2.0"
|
|
|
4676
4721
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4677
4722
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
|
4678
4723
|
|
|
4724
|
+
[[package]]
|
|
4725
|
+
name = "secrecy"
|
|
4726
|
+
version = "0.10.3"
|
|
4727
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4728
|
+
checksum = "e891af845473308773346dc847b2c23ee78fe442e0472ac50e22a18a93d3ae5a"
|
|
4729
|
+
dependencies = [
|
|
4730
|
+
"serde",
|
|
4731
|
+
"zeroize",
|
|
4732
|
+
]
|
|
4733
|
+
|
|
4679
4734
|
[[package]]
|
|
4680
4735
|
name = "security-framework"
|
|
4681
4736
|
version = "3.7.0"
|
|
@@ -5382,6 +5437,7 @@ dependencies = [
|
|
|
5382
5437
|
"bytes",
|
|
5383
5438
|
"libc",
|
|
5384
5439
|
"mio",
|
|
5440
|
+
"parking_lot",
|
|
5385
5441
|
"pin-project-lite",
|
|
5386
5442
|
"signal-hook-registry",
|
|
5387
5443
|
"socket2",
|
|
@@ -6098,6 +6154,19 @@ dependencies = [
|
|
|
6098
6154
|
"wasmparser",
|
|
6099
6155
|
]
|
|
6100
6156
|
|
|
6157
|
+
[[package]]
|
|
6158
|
+
name = "wasm-streams"
|
|
6159
|
+
version = "0.5.0"
|
|
6160
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6161
|
+
checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb"
|
|
6162
|
+
dependencies = [
|
|
6163
|
+
"futures-util",
|
|
6164
|
+
"js-sys",
|
|
6165
|
+
"wasm-bindgen",
|
|
6166
|
+
"wasm-bindgen-futures",
|
|
6167
|
+
"web-sys",
|
|
6168
|
+
]
|
|
6169
|
+
|
|
6101
6170
|
[[package]]
|
|
6102
6171
|
name = "wasmparser"
|
|
6103
6172
|
version = "0.244.0"
|
|
@@ -6788,9 +6857,9 @@ dependencies = [
|
|
|
6788
6857
|
|
|
6789
6858
|
[[package]]
|
|
6790
6859
|
name = "zip"
|
|
6791
|
-
version = "8.5.
|
|
6860
|
+
version = "8.5.1"
|
|
6792
6861
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6793
|
-
checksum = "
|
|
6862
|
+
checksum = "dcab981e19633ebcf0b001ddd37dd802996098bc1864f90b7c5d970ce76c1d59"
|
|
6794
6863
|
dependencies = [
|
|
6795
6864
|
"crc32fast",
|
|
6796
6865
|
"flate2",
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.
|
|
3
|
+
version = "4.8.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
7
|
-
license = "
|
|
7
|
+
license = "Elastic-2.0"
|
|
8
8
|
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
|
9
9
|
homepage = "https://kreuzberg.dev"
|
|
10
10
|
documentation = "https://docs.rs/kreuzberg"
|
|
@@ -22,6 +22,7 @@ use kreuzberg::{
|
|
|
22
22
|
ImageExtractionConfig, LanguageDetectionConfig, LayoutDetectionConfig, OcrConfig, OutputFormat, PdfConfig,
|
|
23
23
|
PostProcessorConfig, TokenReductionConfig,
|
|
24
24
|
};
|
|
25
|
+
use kreuzberg::core::config::ContentFilterConfig;
|
|
25
26
|
use magnus::value::ReprValue;
|
|
26
27
|
use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value};
|
|
27
28
|
use std::fs;
|
|
@@ -50,6 +51,8 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
|
|
|
50
51
|
auto_rotate: false,
|
|
51
52
|
pipeline: None,
|
|
52
53
|
quality_thresholds: None,
|
|
54
|
+
vlm_config: None,
|
|
55
|
+
vlm_prompt: None,
|
|
53
56
|
};
|
|
54
57
|
|
|
55
58
|
if let Some(val) = get_kw(ruby, hash, "tesseract_config")
|
|
@@ -860,6 +863,42 @@ pub fn parse_email_config(ruby: &Ruby, hash: RHash) -> Result<EmailConfig, Error
|
|
|
860
863
|
Ok(config)
|
|
861
864
|
}
|
|
862
865
|
|
|
866
|
+
/// Parse ContentFilterConfig from Ruby Hash
|
|
867
|
+
pub fn parse_content_filter_config(ruby: &Ruby, hash: RHash) -> Result<ContentFilterConfig, Error> {
|
|
868
|
+
let include_headers = if let Some(val) = get_kw(ruby, hash, "include_headers") {
|
|
869
|
+
bool::try_convert(val)?
|
|
870
|
+
} else {
|
|
871
|
+
false
|
|
872
|
+
};
|
|
873
|
+
|
|
874
|
+
let include_footers = if let Some(val) = get_kw(ruby, hash, "include_footers") {
|
|
875
|
+
bool::try_convert(val)?
|
|
876
|
+
} else {
|
|
877
|
+
false
|
|
878
|
+
};
|
|
879
|
+
|
|
880
|
+
let strip_repeating_text = if let Some(val) = get_kw(ruby, hash, "strip_repeating_text") {
|
|
881
|
+
bool::try_convert(val)?
|
|
882
|
+
} else {
|
|
883
|
+
true
|
|
884
|
+
};
|
|
885
|
+
|
|
886
|
+
let include_watermarks = if let Some(val) = get_kw(ruby, hash, "include_watermarks") {
|
|
887
|
+
bool::try_convert(val)?
|
|
888
|
+
} else {
|
|
889
|
+
false
|
|
890
|
+
};
|
|
891
|
+
|
|
892
|
+
let config = ContentFilterConfig {
|
|
893
|
+
include_headers,
|
|
894
|
+
include_footers,
|
|
895
|
+
strip_repeating_text,
|
|
896
|
+
include_watermarks,
|
|
897
|
+
};
|
|
898
|
+
|
|
899
|
+
Ok(config)
|
|
900
|
+
}
|
|
901
|
+
|
|
863
902
|
/// Parse ExtractionConfig from Ruby Hash
|
|
864
903
|
pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
|
|
865
904
|
let mut config = ExtractionConfig::default();
|
|
@@ -996,6 +1035,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
996
1035
|
config.email = Some(parse_email_config(ruby, email_hash)?);
|
|
997
1036
|
}
|
|
998
1037
|
|
|
1038
|
+
if let Some(val) = get_kw(ruby, hash, "content_filter")
|
|
1039
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
1040
|
+
{
|
|
1041
|
+
let content_filter_hash = RHash::try_convert(val)?;
|
|
1042
|
+
config.content_filter = Some(parse_content_filter_config(ruby, content_filter_hash)?);
|
|
1043
|
+
}
|
|
1044
|
+
|
|
999
1045
|
if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
|
|
1000
1046
|
let value = usize::try_convert(val)?;
|
|
1001
1047
|
config.max_concurrent_extractions = Some(value);
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
//! Standalone embedding functions for Ruby.
|
|
2
|
+
//!
|
|
3
|
+
//! Exposes `embed_sync` and `embed` module functions that generate vector embeddings
|
|
4
|
+
//! from a list of text strings using the configured ONNX model.
|
|
5
|
+
|
|
6
|
+
use crate::error_handling::{kreuzberg_error, runtime_error};
|
|
7
|
+
use crate::helpers::ruby_value_to_json;
|
|
8
|
+
use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value, scan_args::get_kwargs, scan_args::scan_args};
|
|
9
|
+
use magnus::value::ReprValue;
|
|
10
|
+
|
|
11
|
+
/// Parse an optional Ruby value (Hash or nil) into a `kreuzberg::EmbeddingConfig`.
|
|
12
|
+
fn parse_embedding_config(ruby: &Ruby, config_val: Option<Value>) -> Result<kreuzberg::EmbeddingConfig, Error> {
|
|
13
|
+
match config_val {
|
|
14
|
+
None => Ok(Default::default()),
|
|
15
|
+
Some(val) => {
|
|
16
|
+
if val.equal(ruby.qnil())? {
|
|
17
|
+
return Ok(Default::default());
|
|
18
|
+
}
|
|
19
|
+
let json = ruby_value_to_json(val)?;
|
|
20
|
+
serde_json::from_value(json)
|
|
21
|
+
.map_err(|e| runtime_error(format!("Invalid embedding config: {}", e)))
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/// Convert `Vec<Vec<f32>>` to a Ruby Array of Arrays of Floats.
|
|
27
|
+
fn embeddings_to_ruby(ruby: &Ruby, embeddings: Vec<Vec<f32>>) -> Result<RArray, Error> {
|
|
28
|
+
let outer = ruby.ary_new_capa(embeddings.len());
|
|
29
|
+
for inner_vec in embeddings {
|
|
30
|
+
let inner = ruby.ary_new_capa(inner_vec.len());
|
|
31
|
+
for v in inner_vec {
|
|
32
|
+
inner.push(v as f64)?;
|
|
33
|
+
}
|
|
34
|
+
outer.push(inner)?;
|
|
35
|
+
}
|
|
36
|
+
Ok(outer)
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/// Parse keyword args common to `embed_sync` and `embed`.
|
|
40
|
+
/// Returns `(texts, config)`.
|
|
41
|
+
fn parse_embed_args(
|
|
42
|
+
ruby: &Ruby,
|
|
43
|
+
args: &[Value],
|
|
44
|
+
) -> Result<(Vec<String>, kreuzberg::EmbeddingConfig), Error> {
|
|
45
|
+
let parsed = scan_args::<(), (), (), (), RHash, ()>(args)?;
|
|
46
|
+
let kw = parsed.keywords;
|
|
47
|
+
|
|
48
|
+
let kw_args = get_kwargs::<_, (Value,), (Option<Value>,), ()>(kw, &["texts"], &["config"])?;
|
|
49
|
+
let (texts_val,) = kw_args.required;
|
|
50
|
+
let (config_opt,) = kw_args.optional;
|
|
51
|
+
|
|
52
|
+
let texts_arr = RArray::try_convert(texts_val)
|
|
53
|
+
.map_err(|_| runtime_error("texts must be an Array".to_string()))?;
|
|
54
|
+
let texts: Vec<String> = texts_arr
|
|
55
|
+
.into_iter()
|
|
56
|
+
.enumerate()
|
|
57
|
+
.map(|(i, v)| {
|
|
58
|
+
String::try_convert(v)
|
|
59
|
+
.map_err(|_| runtime_error(format!("texts[{}] must be a String", i)))
|
|
60
|
+
})
|
|
61
|
+
.collect::<Result<_, _>>()?;
|
|
62
|
+
|
|
63
|
+
let config = parse_embedding_config(ruby, config_opt)?;
|
|
64
|
+
Ok((texts, config))
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/// Generate embeddings synchronously.
|
|
68
|
+
///
|
|
69
|
+
/// Keyword args: `texts:` (Array of String), `config:` (Hash, optional)
|
|
70
|
+
/// Returns: Array of Arrays of Float (one per input text).
|
|
71
|
+
pub fn embed_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
72
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
73
|
+
let (texts, config) = parse_embed_args(&ruby, args)?;
|
|
74
|
+
let embeddings = kreuzberg::embed_texts(&texts, &config).map_err(kreuzberg_error)?;
|
|
75
|
+
embeddings_to_ruby(&ruby, embeddings)
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/// Generate embeddings (delegates to `embed_sync`).
|
|
79
|
+
///
|
|
80
|
+
/// Ruby's GVL prevents true async execution, so this simply delegates to
|
|
81
|
+
/// the synchronous implementation to avoid creating a throwaway Tokio runtime.
|
|
82
|
+
///
|
|
83
|
+
/// Keyword args: `texts:` (Array of String), `config:` (Hash, optional)
|
|
84
|
+
/// Returns: Array of Arrays of Float (one per input text).
|
|
85
|
+
pub fn embed(args: &[Value]) -> Result<RArray, Error> {
|
|
86
|
+
embed_sync(args)
|
|
87
|
+
}
|
|
@@ -101,6 +101,13 @@ pub fn kreuzberg_error(err: KreuzbergError) -> Error {
|
|
|
101
101
|
)
|
|
102
102
|
}
|
|
103
103
|
}
|
|
104
|
+
KreuzbergError::Embedding { message, .. } => {
|
|
105
|
+
if let Some(class) = fetch_error_class("EmbeddingError") {
|
|
106
|
+
Error::new(class, message)
|
|
107
|
+
} else {
|
|
108
|
+
Error::new(ruby.exception_runtime_error(), format!("EmbeddingError: {}", message))
|
|
109
|
+
}
|
|
110
|
+
}
|
|
104
111
|
other => Error::new(ruby.exception_runtime_error(), other.to_string()),
|
|
105
112
|
}
|
|
106
113
|
}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
//! Provides extraction, OCR, chunking, and language detection for 30+ file formats.
|
|
7
7
|
|
|
8
8
|
// Module declarations
|
|
9
|
+
mod embedding;
|
|
9
10
|
mod error_handling;
|
|
10
11
|
mod gc_guarded_value;
|
|
11
12
|
mod helpers;
|
|
@@ -457,6 +458,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
457
458
|
module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
|
|
458
459
|
module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
|
|
459
460
|
|
|
461
|
+
// Embedding functions
|
|
462
|
+
module.define_module_function("embed_sync", function!(embedding::embed_sync, -1))?;
|
|
463
|
+
module.define_module_function("embed", function!(embedding::embed, -1))?;
|
|
464
|
+
|
|
460
465
|
// PDF page iterator
|
|
461
466
|
module.define_module_function("native_render_pdf_pages_iter", function!(render_pdf_pages_iter, 2))?;
|
|
462
467
|
module.define_module_function("native_render_pdf_page", function!(native_render_pdf_page, 3))?;
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -856,6 +856,40 @@ module Kreuzberg
|
|
|
856
856
|
end
|
|
857
857
|
end
|
|
858
858
|
|
|
859
|
+
# Content filter configuration for controlling extraction of headers, footers,
|
|
860
|
+
# watermarks, and repeating text across document formats.
|
|
861
|
+
#
|
|
862
|
+
# @example Include headers and footers
|
|
863
|
+
# filter = ContentFilter.new(include_headers: true, include_footers: true)
|
|
864
|
+
#
|
|
865
|
+
# @example Disable repeating text removal
|
|
866
|
+
# filter = ContentFilter.new(strip_repeating_text: false)
|
|
867
|
+
#
|
|
868
|
+
class ContentFilter
|
|
869
|
+
attr_reader :include_headers, :include_footers, :strip_repeating_text, :include_watermarks
|
|
870
|
+
|
|
871
|
+
def initialize(
|
|
872
|
+
include_headers: false,
|
|
873
|
+
include_footers: false,
|
|
874
|
+
strip_repeating_text: true,
|
|
875
|
+
include_watermarks: false
|
|
876
|
+
)
|
|
877
|
+
@include_headers = include_headers ? true : false
|
|
878
|
+
@include_footers = include_footers ? true : false
|
|
879
|
+
@strip_repeating_text = strip_repeating_text ? true : false
|
|
880
|
+
@include_watermarks = include_watermarks ? true : false
|
|
881
|
+
end
|
|
882
|
+
|
|
883
|
+
def to_h
|
|
884
|
+
{
|
|
885
|
+
include_headers: @include_headers,
|
|
886
|
+
include_footers: @include_footers,
|
|
887
|
+
strip_repeating_text: @strip_repeating_text,
|
|
888
|
+
include_watermarks: @include_watermarks
|
|
889
|
+
}
|
|
890
|
+
end
|
|
891
|
+
end
|
|
892
|
+
|
|
859
893
|
# Layout detection configuration
|
|
860
894
|
#
|
|
861
895
|
# @example Basic usage
|
|
@@ -951,7 +985,7 @@ module Kreuzberg
|
|
|
951
985
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
952
986
|
:security_limits, :layout, :concurrency,
|
|
953
987
|
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
|
|
954
|
-
:max_archive_depth, :acceleration, :email
|
|
988
|
+
:max_archive_depth, :acceleration, :email, :content_filter
|
|
955
989
|
|
|
956
990
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
957
991
|
alias image_extraction images
|
|
@@ -977,7 +1011,7 @@ module Kreuzberg
|
|
|
977
1011
|
postprocessor token_reduction keywords html_options pages
|
|
978
1012
|
max_concurrent_extractions output_format result_format
|
|
979
1013
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
980
|
-
max_archive_depth acceleration email
|
|
1014
|
+
max_archive_depth acceleration email content_filter
|
|
981
1015
|
].freeze
|
|
982
1016
|
|
|
983
1017
|
# Aliases for backward compatibility
|
|
@@ -1062,7 +1096,8 @@ module Kreuzberg
|
|
|
1062
1096
|
extraction_timeout_secs: nil,
|
|
1063
1097
|
max_archive_depth: 3,
|
|
1064
1098
|
acceleration: nil,
|
|
1065
|
-
email: nil
|
|
1099
|
+
email: nil,
|
|
1100
|
+
content_filter: nil)
|
|
1066
1101
|
kwargs = {
|
|
1067
1102
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1068
1103
|
force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
|
|
@@ -1080,7 +1115,8 @@ module Kreuzberg
|
|
|
1080
1115
|
extraction_timeout_secs: extraction_timeout_secs,
|
|
1081
1116
|
max_archive_depth: max_archive_depth,
|
|
1082
1117
|
acceleration: acceleration,
|
|
1083
|
-
email: email
|
|
1118
|
+
email: email,
|
|
1119
|
+
content_filter: content_filter
|
|
1084
1120
|
}
|
|
1085
1121
|
extracted = extract_from_hash(hash, kwargs)
|
|
1086
1122
|
|
|
@@ -1115,6 +1151,7 @@ module Kreuzberg
|
|
|
1115
1151
|
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
1116
1152
|
@acceleration = normalize_config(params[:acceleration], Acceleration)
|
|
1117
1153
|
@email = normalize_config(params[:email], Email)
|
|
1154
|
+
@content_filter = normalize_config(params[:content_filter], ContentFilter)
|
|
1118
1155
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1119
1156
|
@max_archive_depth = params[:max_archive_depth]&.to_i || 3
|
|
1120
1157
|
@output_format = validate_output_format(params[:output_format])
|
|
@@ -1175,7 +1212,8 @@ module Kreuzberg
|
|
|
1175
1212
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1176
1213
|
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1177
1214
|
layout: @layout&.to_h, concurrency: @concurrency&.to_h,
|
|
1178
|
-
acceleration: @acceleration&.to_h, email: @email&.to_h
|
|
1215
|
+
acceleration: @acceleration&.to_h, email: @email&.to_h,
|
|
1216
|
+
content_filter: @content_filter&.to_h
|
|
1179
1217
|
}
|
|
1180
1218
|
end
|
|
1181
1219
|
|
data/lib/kreuzberg/errors.rb
CHANGED
|
@@ -11,6 +11,7 @@ module Kreuzberg
|
|
|
11
11
|
ERROR_CODE_PARSING = 5
|
|
12
12
|
ERROR_CODE_OCR = 6
|
|
13
13
|
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
14
|
+
ERROR_CODE_EMBEDDING = 8
|
|
14
15
|
|
|
15
16
|
module Errors
|
|
16
17
|
class PanicContext
|
|
@@ -112,5 +113,8 @@ module Kreuzberg
|
|
|
112
113
|
|
|
113
114
|
# Raised when an unsupported file format or MIME type is encountered
|
|
114
115
|
class UnsupportedFormatError < Error; end
|
|
116
|
+
|
|
117
|
+
# Raised when embedding fails
|
|
118
|
+
class EmbeddingError < Error; end
|
|
115
119
|
end
|
|
116
120
|
end
|
|
@@ -236,6 +236,41 @@ module Kreuzberg
|
|
|
236
236
|
results
|
|
237
237
|
end
|
|
238
238
|
|
|
239
|
+
# Asynchronously generate embeddings for multiple texts.
|
|
240
|
+
#
|
|
241
|
+
# Non-blocking embedding generation from a list of strings.
|
|
242
|
+
#
|
|
243
|
+
# @param texts [Array<String>] List of strings to embed.
|
|
244
|
+
# @param config [Config::Embedding, Hash, nil] Embedding configuration.
|
|
245
|
+
#
|
|
246
|
+
# @return [Array<Array<Float>>] Array of embedding vectors.
|
|
247
|
+
#
|
|
248
|
+
# @raise [Errors::EmbeddingError] If embedding generation fails.
|
|
249
|
+
#
|
|
250
|
+
# @example Generate embeddings asynchronously
|
|
251
|
+
# texts = ["Hello, world!", "Kreuzberg is awesome."]
|
|
252
|
+
# embeddings = Kreuzberg.embed(texts: texts)
|
|
253
|
+
# puts embeddings.first.length # 384
|
|
254
|
+
def embed(texts:, config: nil)
|
|
255
|
+
opts = normalize_config(config)
|
|
256
|
+
native_embed(texts: texts.map(&:to_s), config: opts)
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
# Synchronously generate embeddings for multiple texts.
|
|
260
|
+
#
|
|
261
|
+
# Blocking embedding generation from a list of strings.
|
|
262
|
+
#
|
|
263
|
+
# @param texts [Array<String>] List of strings to embed.
|
|
264
|
+
# @param config [Config::Embedding, Hash, nil] Embedding configuration.
|
|
265
|
+
#
|
|
266
|
+
# @return [Array<Array<Float>>] Array of embedding vectors.
|
|
267
|
+
#
|
|
268
|
+
# @raise [Errors::EmbeddingError] If embedding generation fails.
|
|
269
|
+
def embed_sync(texts:, config: nil)
|
|
270
|
+
opts = normalize_config(config)
|
|
271
|
+
native_embed_sync(texts: texts.map(&:to_s), config: opts)
|
|
272
|
+
end
|
|
273
|
+
|
|
239
274
|
# Synchronously extract content from multiple byte data sources.
|
|
240
275
|
#
|
|
241
276
|
# Processes multiple in-memory binary documents in a single batch operation. Results
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -59,10 +59,13 @@ module Kreuzberg
|
|
|
59
59
|
alias native_batch_extract_bytes batch_extract_bytes
|
|
60
60
|
alias native_clear_cache clear_cache
|
|
61
61
|
alias native_cache_stats cache_stats
|
|
62
|
+
alias native_embed_sync embed_sync
|
|
63
|
+
alias native_embed embed
|
|
62
64
|
|
|
63
65
|
private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
|
|
64
66
|
private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
|
|
65
67
|
private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
|
|
68
|
+
private :native_embed_sync, :native_embed
|
|
66
69
|
end
|
|
67
70
|
|
|
68
71
|
module_function :register_post_processor
|
|
@@ -94,6 +97,10 @@ module Kreuzberg
|
|
|
94
97
|
module_function :validate_mime_type
|
|
95
98
|
|
|
96
99
|
module_function :get_extensions_for_mime
|
|
100
|
+
|
|
101
|
+
module_function :embed_sync
|
|
102
|
+
|
|
103
|
+
module_function :embed
|
|
97
104
|
end
|
|
98
105
|
|
|
99
106
|
require_relative 'kreuzberg/cache_api'
|