kreuzberg 4.6.2 → 4.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +116 -14
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/config.rb +46 -5
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +21 -0
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +8 -6
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +27 -3
- data/vendor/kreuzberg/src/api/router.rs +13 -1
- data/vendor/kreuzberg/src/api/types.rs +11 -3
- data/vendor/kreuzberg/src/core/config/merge.rs +149 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -5
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -63
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +17 -1
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +40 -1
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +1 -0
- data/vendor/kreuzberg/src/extraction/docx/drawing.rs +52 -1
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -28
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -7
- data/vendor/kreuzberg/src/extractors/citation.rs +0 -7
- data/vendor/kreuzberg/src/extractors/docx.rs +0 -7
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/epub/content.rs +261 -130
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +175 -65
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +147 -64
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +76 -9
- data/vendor/kreuzberg/src/extractors/excel.rs +0 -13
- data/vendor/kreuzberg/src/extractors/html.rs +0 -14
- data/vendor/kreuzberg/src/extractors/image.rs +0 -7
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +0 -7
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -7
- data/vendor/kreuzberg/src/extractors/mdx.rs +0 -7
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +11 -7
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +9 -9
- data/vendor/kreuzberg/src/extractors/pptx.rs +0 -13
- data/vendor/kreuzberg/src/extractors/pst.rs +0 -13
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +0 -7
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -13
- data/vendor/kreuzberg/src/extractors/text.rs +0 -7
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -7
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/layout/models/rtdetr.rs +26 -0
- data/vendor/kreuzberg/src/lib.rs +4 -0
- data/vendor/kreuzberg/src/mcp/format.rs +2 -58
- data/vendor/kreuzberg/src/mcp/server.rs +48 -13
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +46 -18
- data/vendor/kreuzberg/src/plugins/extractor/instrumented.rs +178 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -1
- data/vendor/kreuzberg/src/service/extraction.rs +118 -0
- data/vendor/kreuzberg/src/service/layers/metrics.rs +87 -0
- data/vendor/kreuzberg/src/service/layers/mod.rs +6 -0
- data/vendor/kreuzberg/src/service/layers/tracing.rs +105 -0
- data/vendor/kreuzberg/src/service/mod.rs +254 -0
- data/vendor/kreuzberg/src/service/request.rs +117 -0
- data/vendor/kreuzberg/src/telemetry/conventions.rs +231 -0
- data/vendor/kreuzberg/src/telemetry/metrics.rs +113 -0
- data/vendor/kreuzberg/src/telemetry/mod.rs +20 -0
- data/vendor/kreuzberg/src/telemetry/spans.rs +79 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -1
- data/vendor/kreuzberg/tests/epub_spine_semantics_tests.rs +727 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +15 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 26e800012598eeb04fe01d85d1ff8df63ac8a02a7c6345a4632bb2aae2981300
|
|
4
|
+
data.tar.gz: 6207e53529cbde80bbacd4db9e9e0bd6f6640242e16e1a1cb1543ccfc8ab0291
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05e510db77e1154b51918b2804549a08a32ee0091bf17ef9f22580391d7eb03044e7e220f398d4f647746b79ec163258ef2862be577662cb542d92a684f37f07
|
|
7
|
+
data.tar.gz: eff1441ce70ff97dec6cbf1ee995aa60fb91a072fee5df9475c6e02072e59aa151fc4a797ddcdebed80c3b53c90f3f8a274392bc8e39c7425cc8fc25d77b101a
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -61,6 +61,21 @@ dependencies = [
|
|
|
61
61
|
"equator",
|
|
62
62
|
]
|
|
63
63
|
|
|
64
|
+
[[package]]
|
|
65
|
+
name = "alloc-no-stdlib"
|
|
66
|
+
version = "2.0.4"
|
|
67
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
68
|
+
checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
|
|
69
|
+
|
|
70
|
+
[[package]]
|
|
71
|
+
name = "alloc-stdlib"
|
|
72
|
+
version = "0.2.2"
|
|
73
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
74
|
+
checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
|
|
75
|
+
dependencies = [
|
|
76
|
+
"alloc-no-stdlib",
|
|
77
|
+
]
|
|
78
|
+
|
|
64
79
|
[[package]]
|
|
65
80
|
name = "allocator-api2"
|
|
66
81
|
version = "0.2.21"
|
|
@@ -188,6 +203,18 @@ dependencies = [
|
|
|
188
203
|
"memchr",
|
|
189
204
|
]
|
|
190
205
|
|
|
206
|
+
[[package]]
|
|
207
|
+
name = "async-compression"
|
|
208
|
+
version = "0.4.41"
|
|
209
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
210
|
+
checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1"
|
|
211
|
+
dependencies = [
|
|
212
|
+
"compression-codecs",
|
|
213
|
+
"compression-core",
|
|
214
|
+
"pin-project-lite",
|
|
215
|
+
"tokio",
|
|
216
|
+
]
|
|
217
|
+
|
|
191
218
|
[[package]]
|
|
192
219
|
name = "async-trait"
|
|
193
220
|
version = "0.1.89"
|
|
@@ -517,6 +544,27 @@ dependencies = [
|
|
|
517
544
|
"generic-array",
|
|
518
545
|
]
|
|
519
546
|
|
|
547
|
+
[[package]]
|
|
548
|
+
name = "brotli"
|
|
549
|
+
version = "8.0.2"
|
|
550
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
551
|
+
checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
|
|
552
|
+
dependencies = [
|
|
553
|
+
"alloc-no-stdlib",
|
|
554
|
+
"alloc-stdlib",
|
|
555
|
+
"brotli-decompressor",
|
|
556
|
+
]
|
|
557
|
+
|
|
558
|
+
[[package]]
|
|
559
|
+
name = "brotli-decompressor"
|
|
560
|
+
version = "5.0.0"
|
|
561
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
562
|
+
checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
|
|
563
|
+
dependencies = [
|
|
564
|
+
"alloc-no-stdlib",
|
|
565
|
+
"alloc-stdlib",
|
|
566
|
+
]
|
|
567
|
+
|
|
520
568
|
[[package]]
|
|
521
569
|
name = "bufrw"
|
|
522
570
|
version = "0.2.0"
|
|
@@ -788,9 +836,9 @@ dependencies = [
|
|
|
788
836
|
|
|
789
837
|
[[package]]
|
|
790
838
|
name = "cmake"
|
|
791
|
-
version = "0.1.
|
|
839
|
+
version = "0.1.58"
|
|
792
840
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
793
|
-
checksum = "
|
|
841
|
+
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
|
|
794
842
|
dependencies = [
|
|
795
843
|
"cc",
|
|
796
844
|
]
|
|
@@ -841,6 +889,26 @@ dependencies = [
|
|
|
841
889
|
"static_assertions",
|
|
842
890
|
]
|
|
843
891
|
|
|
892
|
+
[[package]]
|
|
893
|
+
name = "compression-codecs"
|
|
894
|
+
version = "0.4.37"
|
|
895
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
896
|
+
checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7"
|
|
897
|
+
dependencies = [
|
|
898
|
+
"brotli",
|
|
899
|
+
"compression-core",
|
|
900
|
+
"flate2",
|
|
901
|
+
"memchr",
|
|
902
|
+
"zstd",
|
|
903
|
+
"zstd-safe",
|
|
904
|
+
]
|
|
905
|
+
|
|
906
|
+
[[package]]
|
|
907
|
+
name = "compression-core"
|
|
908
|
+
version = "0.4.31"
|
|
909
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
910
|
+
checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
|
|
911
|
+
|
|
844
912
|
[[package]]
|
|
845
913
|
name = "console"
|
|
846
914
|
version = "0.15.11"
|
|
@@ -4301,9 +4369,9 @@ dependencies = [
|
|
|
4301
4369
|
|
|
4302
4370
|
[[package]]
|
|
4303
4371
|
name = "rmcp"
|
|
4304
|
-
version = "1.
|
|
4372
|
+
version = "1.3.0"
|
|
4305
4373
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4306
|
-
checksum = "
|
|
4374
|
+
checksum = "2231b2c085b371c01bc90c0e6c1cab8834711b6394533375bdbf870b0166d419"
|
|
4307
4375
|
dependencies = [
|
|
4308
4376
|
"async-trait",
|
|
4309
4377
|
"base64 0.22.1",
|
|
@@ -4332,9 +4400,9 @@ dependencies = [
|
|
|
4332
4400
|
|
|
4333
4401
|
[[package]]
|
|
4334
4402
|
name = "rmcp-macros"
|
|
4335
|
-
version = "1.
|
|
4403
|
+
version = "1.3.0"
|
|
4336
4404
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4337
|
-
checksum = "
|
|
4405
|
+
checksum = "36ea0e100fadf81be85d7ff70f86cd805c7572601d4ab2946207f36540854b43"
|
|
4338
4406
|
dependencies = [
|
|
4339
4407
|
"darling 0.23.0",
|
|
4340
4408
|
"proc-macro2",
|
|
@@ -4782,9 +4850,9 @@ dependencies = [
|
|
|
4782
4850
|
|
|
4783
4851
|
[[package]]
|
|
4784
4852
|
name = "simd-adler32"
|
|
4785
|
-
version = "0.3.
|
|
4853
|
+
version = "0.3.9"
|
|
4786
4854
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4787
|
-
checksum = "
|
|
4855
|
+
checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
|
|
4788
4856
|
|
|
4789
4857
|
[[package]]
|
|
4790
4858
|
name = "simd_helpers"
|
|
@@ -5359,6 +5427,7 @@ dependencies = [
|
|
|
5359
5427
|
"pin-project-lite",
|
|
5360
5428
|
"sync_wrapper",
|
|
5361
5429
|
"tokio",
|
|
5430
|
+
"tokio-util",
|
|
5362
5431
|
"tower-layer",
|
|
5363
5432
|
"tower-service",
|
|
5364
5433
|
"tracing",
|
|
@@ -5370,18 +5439,23 @@ version = "0.6.8"
|
|
|
5370
5439
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5371
5440
|
checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
|
|
5372
5441
|
dependencies = [
|
|
5442
|
+
"async-compression",
|
|
5373
5443
|
"bitflags",
|
|
5374
5444
|
"bytes",
|
|
5445
|
+
"futures-core",
|
|
5375
5446
|
"futures-util",
|
|
5376
5447
|
"http",
|
|
5377
5448
|
"http-body",
|
|
5378
5449
|
"http-body-util",
|
|
5379
5450
|
"iri-string",
|
|
5380
5451
|
"pin-project-lite",
|
|
5452
|
+
"tokio",
|
|
5453
|
+
"tokio-util",
|
|
5381
5454
|
"tower",
|
|
5382
5455
|
"tower-layer",
|
|
5383
5456
|
"tower-service",
|
|
5384
5457
|
"tracing",
|
|
5458
|
+
"uuid",
|
|
5385
5459
|
]
|
|
5386
5460
|
|
|
5387
5461
|
[[package]]
|
|
@@ -5535,9 +5609,9 @@ checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
|
|
|
5535
5609
|
|
|
5536
5610
|
[[package]]
|
|
5537
5611
|
name = "unicode-segmentation"
|
|
5538
|
-
version = "1.13.
|
|
5612
|
+
version = "1.13.2"
|
|
5539
5613
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5540
|
-
checksum = "
|
|
5614
|
+
checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
|
|
5541
5615
|
|
|
5542
5616
|
[[package]]
|
|
5543
5617
|
name = "unicode-width"
|
|
@@ -5713,9 +5787,9 @@ dependencies = [
|
|
|
5713
5787
|
|
|
5714
5788
|
[[package]]
|
|
5715
5789
|
name = "uuid"
|
|
5716
|
-
version = "1.
|
|
5790
|
+
version = "1.23.0"
|
|
5717
5791
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5718
|
-
checksum = "
|
|
5792
|
+
checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
|
|
5719
5793
|
dependencies = [
|
|
5720
5794
|
"getrandom 0.4.2",
|
|
5721
5795
|
"js-sys",
|
|
@@ -6595,6 +6669,34 @@ dependencies = [
|
|
|
6595
6669
|
"simd-adler32",
|
|
6596
6670
|
]
|
|
6597
6671
|
|
|
6672
|
+
[[package]]
|
|
6673
|
+
name = "zstd"
|
|
6674
|
+
version = "0.13.3"
|
|
6675
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6676
|
+
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
|
|
6677
|
+
dependencies = [
|
|
6678
|
+
"zstd-safe",
|
|
6679
|
+
]
|
|
6680
|
+
|
|
6681
|
+
[[package]]
|
|
6682
|
+
name = "zstd-safe"
|
|
6683
|
+
version = "7.2.4"
|
|
6684
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6685
|
+
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
|
|
6686
|
+
dependencies = [
|
|
6687
|
+
"zstd-sys",
|
|
6688
|
+
]
|
|
6689
|
+
|
|
6690
|
+
[[package]]
|
|
6691
|
+
name = "zstd-sys"
|
|
6692
|
+
version = "2.0.16+zstd.1.5.7"
|
|
6693
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6694
|
+
checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
|
|
6695
|
+
dependencies = [
|
|
6696
|
+
"cc",
|
|
6697
|
+
"pkg-config",
|
|
6698
|
+
]
|
|
6699
|
+
|
|
6598
6700
|
[[package]]
|
|
6599
6701
|
name = "zune-core"
|
|
6600
6702
|
version = "0.5.1"
|
|
@@ -6612,9 +6714,9 @@ dependencies = [
|
|
|
6612
6714
|
|
|
6613
6715
|
[[package]]
|
|
6614
6716
|
name = "zune-jpeg"
|
|
6615
|
-
version = "0.5.
|
|
6717
|
+
version = "0.5.15"
|
|
6616
6718
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6617
|
-
checksum = "
|
|
6719
|
+
checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296"
|
|
6618
6720
|
dependencies = [
|
|
6619
6721
|
"zune-core",
|
|
6620
6722
|
]
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -837,6 +837,25 @@ module Kreuzberg
|
|
|
837
837
|
end
|
|
838
838
|
end
|
|
839
839
|
|
|
840
|
+
# Email extraction configuration
|
|
841
|
+
#
|
|
842
|
+
# @example With fallback codepage
|
|
843
|
+
# email = Email.new(msg_fallback_codepage: 1251)
|
|
844
|
+
#
|
|
845
|
+
class Email
|
|
846
|
+
attr_reader :msg_fallback_codepage
|
|
847
|
+
|
|
848
|
+
def initialize(msg_fallback_codepage: nil)
|
|
849
|
+
@msg_fallback_codepage = msg_fallback_codepage&.to_i
|
|
850
|
+
end
|
|
851
|
+
|
|
852
|
+
def to_h
|
|
853
|
+
h = {}
|
|
854
|
+
h[:msg_fallback_codepage] = @msg_fallback_codepage unless @msg_fallback_codepage.nil?
|
|
855
|
+
h
|
|
856
|
+
end
|
|
857
|
+
end
|
|
858
|
+
|
|
840
859
|
# Layout detection configuration
|
|
841
860
|
#
|
|
842
861
|
# @example Basic usage with fast preset
|
|
@@ -933,7 +952,8 @@ module Kreuzberg
|
|
|
933
952
|
:token_reduction, :keywords, :html_options, :pages,
|
|
934
953
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
935
954
|
:security_limits, :layout, :concurrency,
|
|
936
|
-
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
|
|
955
|
+
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
|
|
956
|
+
:max_archive_depth, :acceleration, :email
|
|
937
957
|
|
|
938
958
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
939
959
|
alias image_extraction images
|
|
@@ -959,6 +979,7 @@ module Kreuzberg
|
|
|
959
979
|
postprocessor token_reduction keywords html_options pages
|
|
960
980
|
max_concurrent_extractions output_format result_format
|
|
961
981
|
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
982
|
+
max_archive_depth acceleration email
|
|
962
983
|
].freeze
|
|
963
984
|
|
|
964
985
|
# Aliases for backward compatibility
|
|
@@ -1015,7 +1036,7 @@ module Kreuzberg
|
|
|
1015
1036
|
new(**normalize_hash_keys(hash))
|
|
1016
1037
|
end
|
|
1017
1038
|
|
|
1018
|
-
def initialize(hash = nil,
|
|
1039
|
+
def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
|
|
1019
1040
|
use_cache: true,
|
|
1020
1041
|
enable_quality_processing: true,
|
|
1021
1042
|
force_ocr: false,
|
|
@@ -1039,7 +1060,10 @@ module Kreuzberg
|
|
|
1039
1060
|
concurrency: nil,
|
|
1040
1061
|
cache_namespace: nil,
|
|
1041
1062
|
cache_ttl_secs: nil,
|
|
1042
|
-
extraction_timeout_secs: nil
|
|
1063
|
+
extraction_timeout_secs: nil,
|
|
1064
|
+
max_archive_depth: 3,
|
|
1065
|
+
acceleration: nil,
|
|
1066
|
+
email: nil)
|
|
1043
1067
|
kwargs = {
|
|
1044
1068
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1045
1069
|
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
@@ -1054,7 +1078,10 @@ module Kreuzberg
|
|
|
1054
1078
|
concurrency: concurrency,
|
|
1055
1079
|
cache_namespace: cache_namespace,
|
|
1056
1080
|
cache_ttl_secs: cache_ttl_secs,
|
|
1057
|
-
extraction_timeout_secs: extraction_timeout_secs
|
|
1081
|
+
extraction_timeout_secs: extraction_timeout_secs,
|
|
1082
|
+
max_archive_depth: max_archive_depth,
|
|
1083
|
+
acceleration: acceleration,
|
|
1084
|
+
email: email
|
|
1058
1085
|
}
|
|
1059
1086
|
extracted = extract_from_hash(hash, kwargs)
|
|
1060
1087
|
|
|
@@ -1086,7 +1113,10 @@ module Kreuzberg
|
|
|
1086
1113
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
1087
1114
|
@layout = normalize_config(params[:layout], LayoutDetection)
|
|
1088
1115
|
@concurrency = normalize_config(params[:concurrency], Concurrency)
|
|
1116
|
+
@acceleration = normalize_config(params[:acceleration], Acceleration)
|
|
1117
|
+
@email = normalize_config(params[:email], Email)
|
|
1089
1118
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1119
|
+
@max_archive_depth = params[:max_archive_depth]&.to_i || 3
|
|
1090
1120
|
@output_format = validate_output_format(params[:output_format])
|
|
1091
1121
|
@result_format = validate_result_format(params[:result_format])
|
|
1092
1122
|
@cache_namespace = params[:cache_namespace]
|
|
@@ -1127,6 +1157,7 @@ module Kreuzberg
|
|
|
1127
1157
|
force_ocr_pages: @force_ocr_pages,
|
|
1128
1158
|
include_document_structure: @include_document_structure,
|
|
1129
1159
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1160
|
+
max_archive_depth: @max_archive_depth,
|
|
1130
1161
|
output_format: @output_format,
|
|
1131
1162
|
result_format: @result_format,
|
|
1132
1163
|
cache_namespace: @cache_namespace,
|
|
@@ -1142,7 +1173,8 @@ module Kreuzberg
|
|
|
1142
1173
|
image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
|
|
1143
1174
|
token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
|
|
1144
1175
|
html_options: @html_options&.to_h, pages: @pages&.to_h,
|
|
1145
|
-
layout: @layout&.to_h, concurrency: @concurrency&.to_h
|
|
1176
|
+
layout: @layout&.to_h, concurrency: @concurrency&.to_h,
|
|
1177
|
+
acceleration: @acceleration&.to_h, email: @email&.to_h
|
|
1146
1178
|
}
|
|
1147
1179
|
end
|
|
1148
1180
|
|
|
@@ -1286,6 +1318,12 @@ module Kreuzberg
|
|
|
1286
1318
|
@layout = normalize_config(value, LayoutDetection)
|
|
1287
1319
|
when :concurrency
|
|
1288
1320
|
@concurrency = normalize_config(value, Concurrency)
|
|
1321
|
+
when :acceleration
|
|
1322
|
+
@acceleration = normalize_config(value, Acceleration)
|
|
1323
|
+
when :email
|
|
1324
|
+
@email = normalize_config(value, Email)
|
|
1325
|
+
when :max_archive_depth
|
|
1326
|
+
@max_archive_depth = value&.to_i || 3
|
|
1289
1327
|
when :max_concurrent_extractions
|
|
1290
1328
|
@max_concurrent_extractions = value&.to_i
|
|
1291
1329
|
when :output_format
|
|
@@ -1373,6 +1411,9 @@ module Kreuzberg
|
|
|
1373
1411
|
@html_options = merged.html_options
|
|
1374
1412
|
@pages = merged.pages
|
|
1375
1413
|
@layout = merged.layout
|
|
1414
|
+
@acceleration = merged.acceleration
|
|
1415
|
+
@email = merged.email
|
|
1416
|
+
@max_archive_depth = merged.max_archive_depth
|
|
1376
1417
|
end
|
|
1377
1418
|
|
|
1378
1419
|
def update_output_options(merged)
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -459,6 +459,21 @@ module Kreuzberg
|
|
|
459
459
|
def to_h: () -> Hash[Symbol, untyped]
|
|
460
460
|
end
|
|
461
461
|
|
|
462
|
+
class Acceleration
|
|
463
|
+
attr_reader provider: String
|
|
464
|
+
attr_reader device_id: Integer
|
|
465
|
+
|
|
466
|
+
def initialize: (?provider: String, ?device_id: Integer) -> void
|
|
467
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
class Email
|
|
471
|
+
attr_reader msg_fallback_codepage: Integer?
|
|
472
|
+
|
|
473
|
+
def initialize: (?msg_fallback_codepage: Integer?) -> void
|
|
474
|
+
def to_h: () -> Hash[Symbol, untyped]
|
|
475
|
+
end
|
|
476
|
+
|
|
462
477
|
class LayoutDetection
|
|
463
478
|
attr_reader preset: String
|
|
464
479
|
attr_reader confidence_threshold: Float?
|
|
@@ -497,7 +512,10 @@ module Kreuzberg
|
|
|
497
512
|
attr_reader pages: PageConfig?
|
|
498
513
|
attr_reader layout: LayoutDetection?
|
|
499
514
|
attr_reader concurrency: Concurrency?
|
|
515
|
+
attr_reader acceleration: Acceleration?
|
|
516
|
+
attr_reader email: Email?
|
|
500
517
|
attr_reader max_concurrent_extractions: Integer?
|
|
518
|
+
attr_reader max_archive_depth: Integer
|
|
501
519
|
attr_reader output_format: String?
|
|
502
520
|
attr_reader result_format: String?
|
|
503
521
|
attr_reader security_limits: Hash[String, Integer]?
|
|
@@ -524,7 +542,10 @@ module Kreuzberg
|
|
|
524
542
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
525
543
|
?layout: (LayoutDetection | Hash[Symbol, untyped])?,
|
|
526
544
|
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
545
|
+
?acceleration: (Acceleration | Hash[Symbol, untyped])?,
|
|
546
|
+
?email: (Email | Hash[Symbol, untyped])?,
|
|
527
547
|
?max_concurrent_extractions: Integer?,
|
|
548
|
+
?max_archive_depth: Integer,
|
|
528
549
|
?output_format: String?,
|
|
529
550
|
?result_format: String?,
|
|
530
551
|
?cache_namespace: String?,
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.6.
|
|
5
|
+
version = "4.6.3"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
|
30
30
|
image = { version = "0.25.10", default-features = false }
|
|
31
31
|
itertools = "0.14"
|
|
32
32
|
js-sys = "0.3"
|
|
33
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.
|
|
34
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.3", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.3" }
|
|
35
35
|
lazy_static = "1.5.0"
|
|
36
36
|
libc = "0.2.183"
|
|
37
37
|
log = "0.4"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.6.
|
|
3
|
+
version = "4.6.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -135,15 +135,17 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
|
|
|
135
135
|
keywords-rake = ["dep:rake", "stopwords"]
|
|
136
136
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
137
137
|
|
|
138
|
+
tower-service = ["dep:tower", "tokio-runtime"]
|
|
139
|
+
|
|
138
140
|
api = [
|
|
141
|
+
"tower-service",
|
|
139
142
|
"dep:axum",
|
|
140
|
-
"dep:tower",
|
|
141
143
|
"dep:tower-http",
|
|
142
144
|
"dep:utoipa",
|
|
143
145
|
"tokio-runtime",
|
|
144
146
|
"chunking",
|
|
145
147
|
]
|
|
146
|
-
mcp = ["dep:rmcp", "tokio-runtime"]
|
|
148
|
+
mcp = ["tower-service", "dep:rmcp", "tokio-runtime"]
|
|
147
149
|
mcp-http = ["mcp", "api"]
|
|
148
150
|
|
|
149
151
|
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
@@ -298,7 +300,7 @@ quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
|
298
300
|
rake = { version = "0.3.6", optional = true }
|
|
299
301
|
rayon = "1.11.0"
|
|
300
302
|
regex = "1.12.3"
|
|
301
|
-
rmcp = { version = "1.
|
|
303
|
+
rmcp = { version = "1.3.0", features = [
|
|
302
304
|
"server",
|
|
303
305
|
"macros",
|
|
304
306
|
"base64",
|
|
@@ -327,8 +329,8 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
|
|
|
327
329
|
] }
|
|
328
330
|
tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
|
|
329
331
|
toml = "1.1.0"
|
|
330
|
-
tower = { version = "0.5", optional = true }
|
|
331
|
-
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
332
|
+
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
333
|
+
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
332
334
|
tracing = "0.1"
|
|
333
335
|
tracing-opentelemetry = { version = "0.32", optional = true }
|
|
334
336
|
unicode-normalization = { version = "0.1.25", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.6.
|
|
21
|
+
> **🚀 Version 4.6.3 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
use axum::{Json, extract::State};
|
|
4
4
|
|
|
5
|
-
use
|
|
5
|
+
use tower::Service;
|
|
6
|
+
|
|
7
|
+
use crate::{batch_extract_bytes, cache, service::ExtractionRequest};
|
|
6
8
|
|
|
7
9
|
use super::{
|
|
8
10
|
error::{ApiError, JsonApi, MultipartApi},
|
|
@@ -201,7 +203,13 @@ pub async fn extract_handler(
|
|
|
201
203
|
.into_iter()
|
|
202
204
|
.next()
|
|
203
205
|
.expect("files.len() == 1 guarantees one element exists");
|
|
204
|
-
let
|
|
206
|
+
let request = ExtractionRequest::bytes(data, mime_type, final_config.clone());
|
|
207
|
+
let mut svc = state
|
|
208
|
+
.extraction_service
|
|
209
|
+
.lock()
|
|
210
|
+
.expect("extraction service lock poisoned")
|
|
211
|
+
.clone();
|
|
212
|
+
let result = svc.call(request).await?;
|
|
205
213
|
return Ok(Json(vec![result]));
|
|
206
214
|
}
|
|
207
215
|
|
|
@@ -210,7 +218,21 @@ pub async fn extract_handler(
|
|
|
210
218
|
.map(|(data, mime, _name)| (data, mime, None))
|
|
211
219
|
.collect();
|
|
212
220
|
|
|
213
|
-
|
|
221
|
+
#[cfg(feature = "otel")]
|
|
222
|
+
let batch_span = tracing::info_span!(
|
|
223
|
+
"kreuzberg.service",
|
|
224
|
+
{ crate::telemetry::conventions::OPERATION } = crate::telemetry::conventions::operations::BATCH_EXTRACT,
|
|
225
|
+
{ crate::telemetry::conventions::BATCH_SIZE } = files_data.len(),
|
|
226
|
+
);
|
|
227
|
+
#[cfg(not(feature = "otel"))]
|
|
228
|
+
let batch_span = tracing::Span::none();
|
|
229
|
+
|
|
230
|
+
let results = {
|
|
231
|
+
use tracing::Instrument;
|
|
232
|
+
batch_extract_bytes(files_data, final_config)
|
|
233
|
+
.instrument(batch_span)
|
|
234
|
+
.await?
|
|
235
|
+
};
|
|
214
236
|
Ok(Json(results))
|
|
215
237
|
}
|
|
216
238
|
|
|
@@ -878,8 +900,10 @@ mod tests {
|
|
|
878
900
|
use tower::ServiceExt;
|
|
879
901
|
|
|
880
902
|
fn test_router() -> Router {
|
|
903
|
+
let extraction_service = crate::service::ExtractionServiceBuilder::new().build();
|
|
881
904
|
let state = ApiState {
|
|
882
905
|
default_config: std::sync::Arc::new(crate::ExtractionConfig::default()),
|
|
906
|
+
extraction_service: std::sync::Arc::new(std::sync::Mutex::new(extraction_service)),
|
|
883
907
|
};
|
|
884
908
|
Router::new()
|
|
885
909
|
.route("/version", get(version_handler))
|
|
@@ -8,12 +8,16 @@ use axum::{
|
|
|
8
8
|
routing::{delete, get, post},
|
|
9
9
|
};
|
|
10
10
|
use tower_http::{
|
|
11
|
+
catch_panic::CatchPanicLayer,
|
|
12
|
+
compression::CompressionLayer,
|
|
11
13
|
cors::{AllowOrigin, Any, CorsLayer},
|
|
12
14
|
limit::RequestBodyLimitLayer,
|
|
15
|
+
request_id::{MakeRequestUuid, PropagateRequestIdLayer, SetRequestIdLayer},
|
|
16
|
+
sensitive_headers::SetSensitiveHeadersLayer,
|
|
13
17
|
trace::TraceLayer,
|
|
14
18
|
};
|
|
15
19
|
|
|
16
|
-
use crate::{ExtractionConfig, core::ServerConfig};
|
|
20
|
+
use crate::{ExtractionConfig, core::ServerConfig, service::ExtractionServiceBuilder};
|
|
17
21
|
|
|
18
22
|
use super::{
|
|
19
23
|
handlers::{
|
|
@@ -119,8 +123,11 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
119
123
|
limits: ApiSizeLimits,
|
|
120
124
|
server_config: ServerConfig,
|
|
121
125
|
) -> Router {
|
|
126
|
+
let extraction_service = ExtractionServiceBuilder::new().with_tracing().with_metrics().build();
|
|
127
|
+
|
|
122
128
|
let state = ApiState {
|
|
123
129
|
default_config: Arc::new(config),
|
|
130
|
+
extraction_service: Arc::new(std::sync::Mutex::new(extraction_service)),
|
|
124
131
|
};
|
|
125
132
|
|
|
126
133
|
// CORS configuration based on ServerConfig
|
|
@@ -177,6 +184,11 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
177
184
|
.layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
|
|
178
185
|
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
179
186
|
.layer(cors_layer)
|
|
187
|
+
.layer(SetRequestIdLayer::x_request_id(MakeRequestUuid))
|
|
188
|
+
.layer(PropagateRequestIdLayer::x_request_id())
|
|
189
|
+
.layer(CompressionLayer::new())
|
|
190
|
+
.layer(CatchPanicLayer::new())
|
|
191
|
+
.layer(SetSensitiveHeadersLayer::new([axum::http::header::AUTHORIZATION]))
|
|
180
192
|
.layer(TraceLayer::new_for_http())
|
|
181
193
|
.with_state(state)
|
|
182
194
|
}
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
//! API request and response types.
|
|
2
2
|
|
|
3
|
+
use std::sync::{Arc, Mutex};
|
|
4
|
+
|
|
3
5
|
use serde::{Deserialize, Serialize};
|
|
4
|
-
use
|
|
6
|
+
use tower::util::BoxCloneService;
|
|
5
7
|
|
|
6
|
-
use crate::{ExtractionConfig, types::ExtractionResult};
|
|
8
|
+
use crate::{ExtractionConfig, KreuzbergError, service::ExtractionRequest, types::ExtractionResult};
|
|
7
9
|
|
|
8
10
|
/// API server size limit configuration.
|
|
9
11
|
///
|
|
@@ -174,10 +176,16 @@ pub struct ErrorResponse {
|
|
|
174
176
|
///
|
|
175
177
|
/// Holds the default extraction configuration loaded from config file
|
|
176
178
|
/// (via discovery or explicit path). Per-request configs override these defaults.
|
|
177
|
-
#[derive(
|
|
179
|
+
#[derive(Clone)]
|
|
178
180
|
pub struct ApiState {
|
|
179
181
|
/// Default extraction configuration
|
|
180
182
|
pub default_config: Arc<ExtractionConfig>,
|
|
183
|
+
/// Tower service for extraction requests.
|
|
184
|
+
///
|
|
185
|
+
/// Wrapped in `Arc<Mutex>` because `BoxCloneService` is `Send` but not `Sync`,
|
|
186
|
+
/// while `ApiState` must be `Clone + Sync` for Axum's state requirement.
|
|
187
|
+
/// The lock is held only long enough to clone the service.
|
|
188
|
+
pub extraction_service: Arc<Mutex<BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError>>>,
|
|
181
189
|
}
|
|
182
190
|
|
|
183
191
|
/// Cache statistics response.
|