kreuzberg 4.6.2 → 4.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +116 -14
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/config.rb +46 -5
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/sig/kreuzberg.rbs +21 -0
  8. data/vendor/Cargo.toml +3 -3
  9. data/vendor/kreuzberg/Cargo.toml +8 -6
  10. data/vendor/kreuzberg/README.md +1 -1
  11. data/vendor/kreuzberg/src/api/handlers.rs +27 -3
  12. data/vendor/kreuzberg/src/api/router.rs +13 -1
  13. data/vendor/kreuzberg/src/api/types.rs +11 -3
  14. data/vendor/kreuzberg/src/core/config/merge.rs +149 -0
  15. data/vendor/kreuzberg/src/core/config/mod.rs +1 -0
  16. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -5
  17. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -63
  18. data/vendor/kreuzberg/src/core/extractor/helpers.rs +17 -1
  19. data/vendor/kreuzberg/src/core/pipeline/execution.rs +40 -1
  20. data/vendor/kreuzberg/src/core/pipeline/tests.rs +1 -0
  21. data/vendor/kreuzberg/src/extraction/docx/drawing.rs +52 -1
  22. data/vendor/kreuzberg/src/extractors/archive.rs +0 -28
  23. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -7
  24. data/vendor/kreuzberg/src/extractors/citation.rs +0 -7
  25. data/vendor/kreuzberg/src/extractors/docx.rs +0 -7
  26. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  27. data/vendor/kreuzberg/src/extractors/epub/content.rs +261 -130
  28. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +175 -65
  29. data/vendor/kreuzberg/src/extractors/epub/mod.rs +147 -64
  30. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +76 -9
  31. data/vendor/kreuzberg/src/extractors/excel.rs +0 -13
  32. data/vendor/kreuzberg/src/extractors/html.rs +0 -14
  33. data/vendor/kreuzberg/src/extractors/image.rs +0 -7
  34. data/vendor/kreuzberg/src/extractors/latex/mod.rs +0 -7
  35. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -7
  36. data/vendor/kreuzberg/src/extractors/mdx.rs +0 -7
  37. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +11 -7
  38. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +9 -9
  39. data/vendor/kreuzberg/src/extractors/pptx.rs +0 -13
  40. data/vendor/kreuzberg/src/extractors/pst.rs +0 -13
  41. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +0 -7
  42. data/vendor/kreuzberg/src/extractors/structured.rs +0 -13
  43. data/vendor/kreuzberg/src/extractors/text.rs +0 -7
  44. data/vendor/kreuzberg/src/extractors/typst.rs +0 -7
  45. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  46. data/vendor/kreuzberg/src/layout/models/rtdetr.rs +26 -0
  47. data/vendor/kreuzberg/src/lib.rs +4 -0
  48. data/vendor/kreuzberg/src/mcp/format.rs +2 -58
  49. data/vendor/kreuzberg/src/mcp/server.rs +48 -13
  50. data/vendor/kreuzberg/src/ocr/processor/mod.rs +46 -18
  51. data/vendor/kreuzberg/src/plugins/extractor/instrumented.rs +178 -0
  52. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -0
  53. data/vendor/kreuzberg/src/plugins/mod.rs +1 -1
  54. data/vendor/kreuzberg/src/service/extraction.rs +118 -0
  55. data/vendor/kreuzberg/src/service/layers/metrics.rs +87 -0
  56. data/vendor/kreuzberg/src/service/layers/mod.rs +6 -0
  57. data/vendor/kreuzberg/src/service/layers/tracing.rs +105 -0
  58. data/vendor/kreuzberg/src/service/mod.rs +254 -0
  59. data/vendor/kreuzberg/src/service/request.rs +117 -0
  60. data/vendor/kreuzberg/src/telemetry/conventions.rs +231 -0
  61. data/vendor/kreuzberg/src/telemetry/metrics.rs +113 -0
  62. data/vendor/kreuzberg/src/telemetry/mod.rs +20 -0
  63. data/vendor/kreuzberg/src/telemetry/spans.rs +79 -0
  64. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -1
  65. data/vendor/kreuzberg/tests/epub_spine_semantics_tests.rs +727 -0
  66. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  67. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  68. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  69. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  70. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  71. metadata +15 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f087c152499deb621223ba54a8ca450bb0510da3b43c880f3969e33ec4b4d5e5
4
- data.tar.gz: dba38092babe378ec93e0dd4f307ab44c10519623302fdf14023a52eff6549b2
3
+ metadata.gz: 26e800012598eeb04fe01d85d1ff8df63ac8a02a7c6345a4632bb2aae2981300
4
+ data.tar.gz: 6207e53529cbde80bbacd4db9e9e0bd6f6640242e16e1a1cb1543ccfc8ab0291
5
5
  SHA512:
6
- metadata.gz: 2e2cb3a3636555ac5f7d54bf473e36872efef5cf1796c667f727fa8622792a5cec009799d35c5ccb776997b7c39a0e5d0745266c7c5e5604d22b4e3e3e2f5e1f
7
- data.tar.gz: b032174ab9b1366d3d05082ef6cfe3b1b700ff66d265fb28353d69b581aee6989688dd8e01b8c516f7046a8968ca86edd8a80cb72cb48477607eb8104d7facb3
6
+ metadata.gz: 05e510db77e1154b51918b2804549a08a32ee0091bf17ef9f22580391d7eb03044e7e220f398d4f647746b79ec163258ef2862be577662cb542d92a684f37f07
7
+ data.tar.gz: eff1441ce70ff97dec6cbf1ee995aa60fb91a072fee5df9475c6e02072e59aa151fc4a797ddcdebed80c3b53c90f3f8a274392bc8e39c7425cc8fc25d77b101a
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -61,6 +61,21 @@ dependencies = [
61
61
  "equator",
62
62
  ]
63
63
 
64
+ [[package]]
65
+ name = "alloc-no-stdlib"
66
+ version = "2.0.4"
67
+ source = "registry+https://github.com/rust-lang/crates.io-index"
68
+ checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
69
+
70
+ [[package]]
71
+ name = "alloc-stdlib"
72
+ version = "0.2.2"
73
+ source = "registry+https://github.com/rust-lang/crates.io-index"
74
+ checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
75
+ dependencies = [
76
+ "alloc-no-stdlib",
77
+ ]
78
+
64
79
  [[package]]
65
80
  name = "allocator-api2"
66
81
  version = "0.2.21"
@@ -188,6 +203,18 @@ dependencies = [
188
203
  "memchr",
189
204
  ]
190
205
 
206
+ [[package]]
207
+ name = "async-compression"
208
+ version = "0.4.41"
209
+ source = "registry+https://github.com/rust-lang/crates.io-index"
210
+ checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1"
211
+ dependencies = [
212
+ "compression-codecs",
213
+ "compression-core",
214
+ "pin-project-lite",
215
+ "tokio",
216
+ ]
217
+
191
218
  [[package]]
192
219
  name = "async-trait"
193
220
  version = "0.1.89"
@@ -517,6 +544,27 @@ dependencies = [
517
544
  "generic-array",
518
545
  ]
519
546
 
547
+ [[package]]
548
+ name = "brotli"
549
+ version = "8.0.2"
550
+ source = "registry+https://github.com/rust-lang/crates.io-index"
551
+ checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560"
552
+ dependencies = [
553
+ "alloc-no-stdlib",
554
+ "alloc-stdlib",
555
+ "brotli-decompressor",
556
+ ]
557
+
558
+ [[package]]
559
+ name = "brotli-decompressor"
560
+ version = "5.0.0"
561
+ source = "registry+https://github.com/rust-lang/crates.io-index"
562
+ checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
563
+ dependencies = [
564
+ "alloc-no-stdlib",
565
+ "alloc-stdlib",
566
+ ]
567
+
520
568
  [[package]]
521
569
  name = "bufrw"
522
570
  version = "0.2.0"
@@ -788,9 +836,9 @@ dependencies = [
788
836
 
789
837
  [[package]]
790
838
  name = "cmake"
791
- version = "0.1.57"
839
+ version = "0.1.58"
792
840
  source = "registry+https://github.com/rust-lang/crates.io-index"
793
- checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
841
+ checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
794
842
  dependencies = [
795
843
  "cc",
796
844
  ]
@@ -841,6 +889,26 @@ dependencies = [
841
889
  "static_assertions",
842
890
  ]
843
891
 
892
+ [[package]]
893
+ name = "compression-codecs"
894
+ version = "0.4.37"
895
+ source = "registry+https://github.com/rust-lang/crates.io-index"
896
+ checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7"
897
+ dependencies = [
898
+ "brotli",
899
+ "compression-core",
900
+ "flate2",
901
+ "memchr",
902
+ "zstd",
903
+ "zstd-safe",
904
+ ]
905
+
906
+ [[package]]
907
+ name = "compression-core"
908
+ version = "0.4.31"
909
+ source = "registry+https://github.com/rust-lang/crates.io-index"
910
+ checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
911
+
844
912
  [[package]]
845
913
  name = "console"
846
914
  version = "0.15.11"
@@ -4301,9 +4369,9 @@ dependencies = [
4301
4369
 
4302
4370
  [[package]]
4303
4371
  name = "rmcp"
4304
- version = "1.2.0"
4372
+ version = "1.3.0"
4305
4373
  source = "registry+https://github.com/rust-lang/crates.io-index"
4306
- checksum = "ba6b9d2f0efe2258b23767f1f9e0054cfbcac9c2d6f81a031214143096d7864f"
4374
+ checksum = "2231b2c085b371c01bc90c0e6c1cab8834711b6394533375bdbf870b0166d419"
4307
4375
  dependencies = [
4308
4376
  "async-trait",
4309
4377
  "base64 0.22.1",
@@ -4332,9 +4400,9 @@ dependencies = [
4332
4400
 
4333
4401
  [[package]]
4334
4402
  name = "rmcp-macros"
4335
- version = "1.2.0"
4403
+ version = "1.3.0"
4336
4404
  source = "registry+https://github.com/rust-lang/crates.io-index"
4337
- checksum = "ab9d95d7ed26ad8306352b0d5f05b593222b272790564589790d210aa15caa9e"
4405
+ checksum = "36ea0e100fadf81be85d7ff70f86cd805c7572601d4ab2946207f36540854b43"
4338
4406
  dependencies = [
4339
4407
  "darling 0.23.0",
4340
4408
  "proc-macro2",
@@ -4782,9 +4850,9 @@ dependencies = [
4782
4850
 
4783
4851
  [[package]]
4784
4852
  name = "simd-adler32"
4785
- version = "0.3.8"
4853
+ version = "0.3.9"
4786
4854
  source = "registry+https://github.com/rust-lang/crates.io-index"
4787
- checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
4855
+ checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
4788
4856
 
4789
4857
  [[package]]
4790
4858
  name = "simd_helpers"
@@ -5359,6 +5427,7 @@ dependencies = [
5359
5427
  "pin-project-lite",
5360
5428
  "sync_wrapper",
5361
5429
  "tokio",
5430
+ "tokio-util",
5362
5431
  "tower-layer",
5363
5432
  "tower-service",
5364
5433
  "tracing",
@@ -5370,18 +5439,23 @@ version = "0.6.8"
5370
5439
  source = "registry+https://github.com/rust-lang/crates.io-index"
5371
5440
  checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
5372
5441
  dependencies = [
5442
+ "async-compression",
5373
5443
  "bitflags",
5374
5444
  "bytes",
5445
+ "futures-core",
5375
5446
  "futures-util",
5376
5447
  "http",
5377
5448
  "http-body",
5378
5449
  "http-body-util",
5379
5450
  "iri-string",
5380
5451
  "pin-project-lite",
5452
+ "tokio",
5453
+ "tokio-util",
5381
5454
  "tower",
5382
5455
  "tower-layer",
5383
5456
  "tower-service",
5384
5457
  "tracing",
5458
+ "uuid",
5385
5459
  ]
5386
5460
 
5387
5461
  [[package]]
@@ -5535,9 +5609,9 @@ checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
5535
5609
 
5536
5610
  [[package]]
5537
5611
  name = "unicode-segmentation"
5538
- version = "1.13.1"
5612
+ version = "1.13.2"
5539
5613
  source = "registry+https://github.com/rust-lang/crates.io-index"
5540
- checksum = "da36089a805484bcccfffe0739803392c8298778a2d2f09febf76fac5ad9025b"
5614
+ checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
5541
5615
 
5542
5616
  [[package]]
5543
5617
  name = "unicode-width"
@@ -5713,9 +5787,9 @@ dependencies = [
5713
5787
 
5714
5788
  [[package]]
5715
5789
  name = "uuid"
5716
- version = "1.22.0"
5790
+ version = "1.23.0"
5717
5791
  source = "registry+https://github.com/rust-lang/crates.io-index"
5718
- checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37"
5792
+ checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
5719
5793
  dependencies = [
5720
5794
  "getrandom 0.4.2",
5721
5795
  "js-sys",
@@ -6595,6 +6669,34 @@ dependencies = [
6595
6669
  "simd-adler32",
6596
6670
  ]
6597
6671
 
6672
+ [[package]]
6673
+ name = "zstd"
6674
+ version = "0.13.3"
6675
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6676
+ checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
6677
+ dependencies = [
6678
+ "zstd-safe",
6679
+ ]
6680
+
6681
+ [[package]]
6682
+ name = "zstd-safe"
6683
+ version = "7.2.4"
6684
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6685
+ checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
6686
+ dependencies = [
6687
+ "zstd-sys",
6688
+ ]
6689
+
6690
+ [[package]]
6691
+ name = "zstd-sys"
6692
+ version = "2.0.16+zstd.1.5.7"
6693
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6694
+ checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
6695
+ dependencies = [
6696
+ "cc",
6697
+ "pkg-config",
6698
+ ]
6699
+
6598
6700
  [[package]]
6599
6701
  name = "zune-core"
6600
6702
  version = "0.5.1"
@@ -6612,9 +6714,9 @@ dependencies = [
6612
6714
 
6613
6715
  [[package]]
6614
6716
  name = "zune-jpeg"
6615
- version = "0.5.14"
6717
+ version = "0.5.15"
6616
6718
  source = "registry+https://github.com/rust-lang/crates.io-index"
6617
- checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6"
6719
+ checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296"
6618
6720
  dependencies = [
6619
6721
  "zune-core",
6620
6722
  ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.6.2"
3
+ version = "4.6.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -837,6 +837,25 @@ module Kreuzberg
837
837
  end
838
838
  end
839
839
 
840
+ # Email extraction configuration
841
+ #
842
+ # @example With fallback codepage
843
+ # email = Email.new(msg_fallback_codepage: 1251)
844
+ #
845
+ class Email
846
+ attr_reader :msg_fallback_codepage
847
+
848
+ def initialize(msg_fallback_codepage: nil)
849
+ @msg_fallback_codepage = msg_fallback_codepage&.to_i
850
+ end
851
+
852
+ def to_h
853
+ h = {}
854
+ h[:msg_fallback_codepage] = @msg_fallback_codepage unless @msg_fallback_codepage.nil?
855
+ h
856
+ end
857
+ end
858
+
840
859
  # Layout detection configuration
841
860
  #
842
861
  # @example Basic usage with fast preset
@@ -933,7 +952,8 @@ module Kreuzberg
933
952
  :token_reduction, :keywords, :html_options, :pages,
934
953
  :max_concurrent_extractions, :output_format, :result_format,
935
954
  :security_limits, :layout, :concurrency,
936
- :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
955
+ :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
956
+ :max_archive_depth, :acceleration, :email
937
957
 
938
958
  # Alias for backward compatibility - image_extraction is the canonical name
939
959
  alias image_extraction images
@@ -959,6 +979,7 @@ module Kreuzberg
959
979
  postprocessor token_reduction keywords html_options pages
960
980
  max_concurrent_extractions output_format result_format
961
981
  security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
982
+ max_archive_depth acceleration email
962
983
  ].freeze
963
984
 
964
985
  # Aliases for backward compatibility
@@ -1015,7 +1036,7 @@ module Kreuzberg
1015
1036
  new(**normalize_hash_keys(hash))
1016
1037
  end
1017
1038
 
1018
- def initialize(hash = nil,
1039
+ def initialize(hash = nil, # rubocop:disable Metrics/MethodLength
1019
1040
  use_cache: true,
1020
1041
  enable_quality_processing: true,
1021
1042
  force_ocr: false,
@@ -1039,7 +1060,10 @@ module Kreuzberg
1039
1060
  concurrency: nil,
1040
1061
  cache_namespace: nil,
1041
1062
  cache_ttl_secs: nil,
1042
- extraction_timeout_secs: nil)
1063
+ extraction_timeout_secs: nil,
1064
+ max_archive_depth: 3,
1065
+ acceleration: nil,
1066
+ email: nil)
1043
1067
  kwargs = {
1044
1068
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1045
1069
  force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
@@ -1054,7 +1078,10 @@ module Kreuzberg
1054
1078
  concurrency: concurrency,
1055
1079
  cache_namespace: cache_namespace,
1056
1080
  cache_ttl_secs: cache_ttl_secs,
1057
- extraction_timeout_secs: extraction_timeout_secs
1081
+ extraction_timeout_secs: extraction_timeout_secs,
1082
+ max_archive_depth: max_archive_depth,
1083
+ acceleration: acceleration,
1084
+ email: email
1058
1085
  }
1059
1086
  extracted = extract_from_hash(hash, kwargs)
1060
1087
 
@@ -1086,7 +1113,10 @@ module Kreuzberg
1086
1113
  @pages = normalize_config(params[:pages], PageConfig)
1087
1114
  @layout = normalize_config(params[:layout], LayoutDetection)
1088
1115
  @concurrency = normalize_config(params[:concurrency], Concurrency)
1116
+ @acceleration = normalize_config(params[:acceleration], Acceleration)
1117
+ @email = normalize_config(params[:email], Email)
1089
1118
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1119
+ @max_archive_depth = params[:max_archive_depth]&.to_i || 3
1090
1120
  @output_format = validate_output_format(params[:output_format])
1091
1121
  @result_format = validate_result_format(params[:result_format])
1092
1122
  @cache_namespace = params[:cache_namespace]
@@ -1127,6 +1157,7 @@ module Kreuzberg
1127
1157
  force_ocr_pages: @force_ocr_pages,
1128
1158
  include_document_structure: @include_document_structure,
1129
1159
  max_concurrent_extractions: @max_concurrent_extractions,
1160
+ max_archive_depth: @max_archive_depth,
1130
1161
  output_format: @output_format,
1131
1162
  result_format: @result_format,
1132
1163
  cache_namespace: @cache_namespace,
@@ -1142,7 +1173,8 @@ module Kreuzberg
1142
1173
  image_extraction: @images&.to_h, postprocessor: @postprocessor&.to_h,
1143
1174
  token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
1144
1175
  html_options: @html_options&.to_h, pages: @pages&.to_h,
1145
- layout: @layout&.to_h, concurrency: @concurrency&.to_h
1176
+ layout: @layout&.to_h, concurrency: @concurrency&.to_h,
1177
+ acceleration: @acceleration&.to_h, email: @email&.to_h
1146
1178
  }
1147
1179
  end
1148
1180
 
@@ -1286,6 +1318,12 @@ module Kreuzberg
1286
1318
  @layout = normalize_config(value, LayoutDetection)
1287
1319
  when :concurrency
1288
1320
  @concurrency = normalize_config(value, Concurrency)
1321
+ when :acceleration
1322
+ @acceleration = normalize_config(value, Acceleration)
1323
+ when :email
1324
+ @email = normalize_config(value, Email)
1325
+ when :max_archive_depth
1326
+ @max_archive_depth = value&.to_i || 3
1289
1327
  when :max_concurrent_extractions
1290
1328
  @max_concurrent_extractions = value&.to_i
1291
1329
  when :output_format
@@ -1373,6 +1411,9 @@ module Kreuzberg
1373
1411
  @html_options = merged.html_options
1374
1412
  @pages = merged.pages
1375
1413
  @layout = merged.layout
1414
+ @acceleration = merged.acceleration
1415
+ @email = merged.email
1416
+ @max_archive_depth = merged.max_archive_depth
1376
1417
  end
1377
1418
 
1378
1419
  def update_output_options(merged)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.6.2'
4
+ VERSION = '4.6.3'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -459,6 +459,21 @@ module Kreuzberg
459
459
  def to_h: () -> Hash[Symbol, untyped]
460
460
  end
461
461
 
462
+ class Acceleration
463
+ attr_reader provider: String
464
+ attr_reader device_id: Integer
465
+
466
+ def initialize: (?provider: String, ?device_id: Integer) -> void
467
+ def to_h: () -> Hash[Symbol, untyped]
468
+ end
469
+
470
+ class Email
471
+ attr_reader msg_fallback_codepage: Integer?
472
+
473
+ def initialize: (?msg_fallback_codepage: Integer?) -> void
474
+ def to_h: () -> Hash[Symbol, untyped]
475
+ end
476
+
462
477
  class LayoutDetection
463
478
  attr_reader preset: String
464
479
  attr_reader confidence_threshold: Float?
@@ -497,7 +512,10 @@ module Kreuzberg
497
512
  attr_reader pages: PageConfig?
498
513
  attr_reader layout: LayoutDetection?
499
514
  attr_reader concurrency: Concurrency?
515
+ attr_reader acceleration: Acceleration?
516
+ attr_reader email: Email?
500
517
  attr_reader max_concurrent_extractions: Integer?
518
+ attr_reader max_archive_depth: Integer
501
519
  attr_reader output_format: String?
502
520
  attr_reader result_format: String?
503
521
  attr_reader security_limits: Hash[String, Integer]?
@@ -524,7 +542,10 @@ module Kreuzberg
524
542
  ?pages: (PageConfig | Hash[Symbol, untyped])?,
525
543
  ?layout: (LayoutDetection | Hash[Symbol, untyped])?,
526
544
  ?concurrency: (Concurrency | Hash[Symbol, untyped])?,
545
+ ?acceleration: (Acceleration | Hash[Symbol, untyped])?,
546
+ ?email: (Email | Hash[Symbol, untyped])?,
527
547
  ?max_concurrent_extractions: Integer?,
548
+ ?max_archive_depth: Integer,
528
549
  ?output_format: String?,
529
550
  ?result_format: String?,
530
551
  ?cache_namespace: String?,
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.6.2"
5
+ version = "4.6.3"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
30
30
  image = { version = "0.25.10", default-features = false }
31
31
  itertools = "0.14"
32
32
  js-sys = "0.3"
33
- kreuzberg = { path = "./crates/kreuzberg", version = "4.6.2", default-features = false }
34
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.2" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.6.3", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.3" }
35
35
  lazy_static = "1.5.0"
36
36
  libc = "0.2.183"
37
37
  log = "0.4"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.6.2"
3
+ version = "4.6.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -135,15 +135,17 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
135
135
  keywords-rake = ["dep:rake", "stopwords"]
136
136
  keywords = ["keywords-yake", "keywords-rake"]
137
137
 
138
+ tower-service = ["dep:tower", "tokio-runtime"]
139
+
138
140
  api = [
141
+ "tower-service",
139
142
  "dep:axum",
140
- "dep:tower",
141
143
  "dep:tower-http",
142
144
  "dep:utoipa",
143
145
  "tokio-runtime",
144
146
  "chunking",
145
147
  ]
146
- mcp = ["dep:rmcp", "tokio-runtime"]
148
+ mcp = ["tower-service", "dep:rmcp", "tokio-runtime"]
147
149
  mcp-http = ["mcp", "api"]
148
150
 
149
151
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
@@ -298,7 +300,7 @@ quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
298
300
  rake = { version = "0.3.6", optional = true }
299
301
  rayon = "1.11.0"
300
302
  regex = "1.12.3"
301
- rmcp = { version = "1.2.0", features = [
303
+ rmcp = { version = "1.3.0", features = [
302
304
  "server",
303
305
  "macros",
304
306
  "base64",
@@ -327,8 +329,8 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
327
329
  ] }
328
330
  tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
329
331
  toml = "1.1.0"
330
- tower = { version = "0.5", optional = true }
331
- tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
332
+ tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
333
+ tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
332
334
  tracing = "0.1"
333
335
  tracing-opentelemetry = { version = "0.32", optional = true }
334
336
  unicode-normalization = { version = "0.1.25", optional = true }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.6.2 Release**
21
+ > **🚀 Version 4.6.3 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -2,7 +2,9 @@
2
2
 
3
3
  use axum::{Json, extract::State};
4
4
 
5
- use crate::{batch_extract_bytes, cache, extract_bytes};
5
+ use tower::Service;
6
+
7
+ use crate::{batch_extract_bytes, cache, service::ExtractionRequest};
6
8
 
7
9
  use super::{
8
10
  error::{ApiError, JsonApi, MultipartApi},
@@ -201,7 +203,13 @@ pub async fn extract_handler(
201
203
  .into_iter()
202
204
  .next()
203
205
  .expect("files.len() == 1 guarantees one element exists");
204
- let result = extract_bytes(&data, mime_type.as_str(), final_config).await?;
206
+ let request = ExtractionRequest::bytes(data, mime_type, final_config.clone());
207
+ let mut svc = state
208
+ .extraction_service
209
+ .lock()
210
+ .expect("extraction service lock poisoned")
211
+ .clone();
212
+ let result = svc.call(request).await?;
205
213
  return Ok(Json(vec![result]));
206
214
  }
207
215
 
@@ -210,7 +218,21 @@ pub async fn extract_handler(
210
218
  .map(|(data, mime, _name)| (data, mime, None))
211
219
  .collect();
212
220
 
213
- let results = batch_extract_bytes(files_data, final_config).await?;
221
+ #[cfg(feature = "otel")]
222
+ let batch_span = tracing::info_span!(
223
+ "kreuzberg.service",
224
+ { crate::telemetry::conventions::OPERATION } = crate::telemetry::conventions::operations::BATCH_EXTRACT,
225
+ { crate::telemetry::conventions::BATCH_SIZE } = files_data.len(),
226
+ );
227
+ #[cfg(not(feature = "otel"))]
228
+ let batch_span = tracing::Span::none();
229
+
230
+ let results = {
231
+ use tracing::Instrument;
232
+ batch_extract_bytes(files_data, final_config)
233
+ .instrument(batch_span)
234
+ .await?
235
+ };
214
236
  Ok(Json(results))
215
237
  }
216
238
 
@@ -878,8 +900,10 @@ mod tests {
878
900
  use tower::ServiceExt;
879
901
 
880
902
  fn test_router() -> Router {
903
+ let extraction_service = crate::service::ExtractionServiceBuilder::new().build();
881
904
  let state = ApiState {
882
905
  default_config: std::sync::Arc::new(crate::ExtractionConfig::default()),
906
+ extraction_service: std::sync::Arc::new(std::sync::Mutex::new(extraction_service)),
883
907
  };
884
908
  Router::new()
885
909
  .route("/version", get(version_handler))
@@ -8,12 +8,16 @@ use axum::{
8
8
  routing::{delete, get, post},
9
9
  };
10
10
  use tower_http::{
11
+ catch_panic::CatchPanicLayer,
12
+ compression::CompressionLayer,
11
13
  cors::{AllowOrigin, Any, CorsLayer},
12
14
  limit::RequestBodyLimitLayer,
15
+ request_id::{MakeRequestUuid, PropagateRequestIdLayer, SetRequestIdLayer},
16
+ sensitive_headers::SetSensitiveHeadersLayer,
13
17
  trace::TraceLayer,
14
18
  };
15
19
 
16
- use crate::{ExtractionConfig, core::ServerConfig};
20
+ use crate::{ExtractionConfig, core::ServerConfig, service::ExtractionServiceBuilder};
17
21
 
18
22
  use super::{
19
23
  handlers::{
@@ -119,8 +123,11 @@ pub fn create_router_with_limits_and_server_config(
119
123
  limits: ApiSizeLimits,
120
124
  server_config: ServerConfig,
121
125
  ) -> Router {
126
+ let extraction_service = ExtractionServiceBuilder::new().with_tracing().with_metrics().build();
127
+
122
128
  let state = ApiState {
123
129
  default_config: Arc::new(config),
130
+ extraction_service: Arc::new(std::sync::Mutex::new(extraction_service)),
124
131
  };
125
132
 
126
133
  // CORS configuration based on ServerConfig
@@ -177,6 +184,11 @@ pub fn create_router_with_limits_and_server_config(
177
184
  .layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
178
185
  .layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
179
186
  .layer(cors_layer)
187
+ .layer(SetRequestIdLayer::x_request_id(MakeRequestUuid))
188
+ .layer(PropagateRequestIdLayer::x_request_id())
189
+ .layer(CompressionLayer::new())
190
+ .layer(CatchPanicLayer::new())
191
+ .layer(SetSensitiveHeadersLayer::new([axum::http::header::AUTHORIZATION]))
180
192
  .layer(TraceLayer::new_for_http())
181
193
  .with_state(state)
182
194
  }
@@ -1,9 +1,11 @@
1
1
  //! API request and response types.
2
2
 
3
+ use std::sync::{Arc, Mutex};
4
+
3
5
  use serde::{Deserialize, Serialize};
4
- use std::sync::Arc;
6
+ use tower::util::BoxCloneService;
5
7
 
6
- use crate::{ExtractionConfig, types::ExtractionResult};
8
+ use crate::{ExtractionConfig, KreuzbergError, service::ExtractionRequest, types::ExtractionResult};
7
9
 
8
10
  /// API server size limit configuration.
9
11
  ///
@@ -174,10 +176,16 @@ pub struct ErrorResponse {
174
176
  ///
175
177
  /// Holds the default extraction configuration loaded from config file
176
178
  /// (via discovery or explicit path). Per-request configs override these defaults.
177
- #[derive(Debug, Clone)]
179
+ #[derive(Clone)]
178
180
  pub struct ApiState {
179
181
  /// Default extraction configuration
180
182
  pub default_config: Arc<ExtractionConfig>,
183
+ /// Tower service for extraction requests.
184
+ ///
185
+ /// Wrapped in `Arc<Mutex>` because `BoxCloneService` is `Send` but not `Sync`,
186
+ /// while `ApiState` must be `Clone + Sync` for Axum's state requirement.
187
+ /// The lock is held only long enough to clone the service.
188
+ pub extraction_service: Arc<Mutex<BoxCloneService<ExtractionRequest, ExtractionResult, KreuzbergError>>>,
181
189
  }
182
190
 
183
191
  /// Cache statistics response.