kreuzberg 4.5.1 → 4.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +28 -1
  5. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  6. data/lib/kreuzberg/config.rb +20 -5
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/vendor/Cargo.toml +4 -3
  10. data/vendor/kreuzberg/Cargo.toml +2 -1
  11. data/vendor/kreuzberg/README.md +1 -1
  12. data/vendor/kreuzberg/src/api/handlers.rs +483 -2
  13. data/vendor/kreuzberg/src/api/mod.rs +7 -2
  14. data/vendor/kreuzberg/src/api/openapi.rs +19 -0
  15. data/vendor/kreuzberg/src/api/router.rs +7 -3
  16. data/vendor/kreuzberg/src/api/types.rs +75 -0
  17. data/vendor/kreuzberg/src/cache/core.rs +223 -122
  18. data/vendor/kreuzberg/src/cache/mod.rs +20 -16
  19. data/vendor/kreuzberg/src/cache/utilities.rs +62 -44
  20. data/vendor/kreuzberg/src/core/config/extraction/core.rs +18 -0
  21. data/vendor/kreuzberg/src/core/extractor/file.rs +79 -0
  22. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +14 -3
  23. data/vendor/kreuzberg/src/layout/engine.rs +3 -0
  24. data/vendor/kreuzberg/src/mcp/mod.rs +9 -1
  25. data/vendor/kreuzberg/src/mcp/params.rs +87 -0
  26. data/vendor/kreuzberg/src/mcp/server.rs +585 -5
  27. data/vendor/kreuzberg/src/ocr/cache.rs +1 -1
  28. data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
  29. data/vendor/kreuzberg/src/ocr/processor/config.rs +21 -23
  30. data/vendor/kreuzberg/src/ocr/processor/execution.rs +6 -25
  31. data/vendor/kreuzberg/src/ocr/processor/validation.rs +29 -9
  32. data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +254 -0
  33. data/vendor/kreuzberg/src/ocr/utils.rs +6 -10
  34. data/vendor/kreuzberg/src/pdf/layout_runner.rs +11 -0
  35. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +9 -1
  36. data/vendor/kreuzberg/src/pdf/markdown/classify.rs +98 -6
  37. data/vendor/kreuzberg/src/pdf/markdown/mod.rs +1 -1
  38. data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +43 -0
  39. data/vendor/kreuzberg/src/pdf/markdown/regions/tables.rs +11 -1
  40. data/vendor/kreuzberg/src/pdf/markdown/render.rs +22 -16
  41. data/vendor/kreuzberg/src/pdf/markdown/text_repair.rs +209 -47
  42. data/vendor/kreuzberg/src/pdf/oxide_text.rs +10 -1
  43. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  44. data/vendor/kreuzberg/src/pdf/text_data.rs +15 -6
  45. data/vendor/kreuzberg/tests/instrumentation_test.rs +2 -2
  46. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  47. data/vendor/kreuzberg-ffi/kreuzberg.h +46 -2
  48. data/vendor/kreuzberg-ffi/src/config_builder.rs +81 -0
  49. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  50. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  51. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/segment.rs +13 -0
  52. data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text.rs +148 -0
  53. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  54. data/vendor/kreuzberg-tesseract/build.rs +61 -0
  55. metadata +2 -5
  56. data/vendor/kreuzberg/src/mcp/tools/cache.rs +0 -179
  57. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +0 -431
  58. data/vendor/kreuzberg/src/mcp/tools/mime.rs +0 -150
  59. data/vendor/kreuzberg/src/mcp/tools/mod.rs +0 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9d245ca9cadfb5b07cab9c8709854cfd0eb488684b6c5fbc6866b891f162b0d0
4
- data.tar.gz: 31ab57a13cef6881bc58c52058b1ade5ca13af00ed5d81df1fc79853ca816e8d
3
+ metadata.gz: 2f29e7c9b7614fc78e0c54f673a804f79625081faa79317da54647937fe51a46
4
+ data.tar.gz: b465d7be3c677c7a7a87eb888503f57b7cf42e5bac353418a191cc2629ad3d5c
5
5
  SHA512:
6
- metadata.gz: 5cada46dd61ecb89dd9a7ffdeeb5df86ca338b5864785d80753e12f0967d23e7183360dde623732c8579e9ba78a9b8ff26bdc978ee744a842a75c33bb3877784
7
- data.tar.gz: 12b1b780c4065379cb7d0fde912bebfac9ae5ea48a33600e4331d705c65d58a05393fa42902e0d1d7b0d5dd2ba98b1dd231fc29a9819a60cb1a8f398146e1b9d
6
+ metadata.gz: fc25d857d8252f4759ed2ea07003107843182c87d855872da228f599371cdb9f705d2883995bc17ac6dd2fadf12d6aa2023eb1abf6f69f5e2844b1a90473cb02
7
+ data.tar.gz: 5fe146eebe572f4a6b5ac89d9e187b97eb72787493b4748ba66c968014cbc7757b0ee6bec64516968ff7419a0c6c4c5b57e0b88240a61cde454ac72fa0fed9e7
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.5.1)
4
+ kreuzberg (4.5.2)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -222,7 +222,7 @@ CHECKSUMS
222
222
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
223
223
  json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
224
224
  json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
225
- kreuzberg (4.5.1)
225
+ kreuzberg (4.5.2)
226
226
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
227
227
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
228
228
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -161,6 +161,12 @@ dependencies = [
161
161
  "syn",
162
162
  ]
163
163
 
164
+ [[package]]
165
+ name = "arrayref"
166
+ version = "0.3.9"
167
+ source = "registry+https://github.com/rust-lang/crates.io-index"
168
+ checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
169
+
164
170
  [[package]]
165
171
  name = "arrayvec"
166
172
  version = "0.7.6"
@@ -473,6 +479,20 @@ dependencies = [
473
479
  "wyz",
474
480
  ]
475
481
 
482
+ [[package]]
483
+ name = "blake3"
484
+ version = "1.8.3"
485
+ source = "registry+https://github.com/rust-lang/crates.io-index"
486
+ checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
487
+ dependencies = [
488
+ "arrayref",
489
+ "arrayvec",
490
+ "cc",
491
+ "cfg-if",
492
+ "constant_time_eq 0.4.2",
493
+ "cpufeatures 0.2.17",
494
+ ]
495
+
476
496
  [[package]]
477
497
  name = "block-buffer"
478
498
  version = "0.10.4"
@@ -916,6 +936,12 @@ version = "0.3.1"
916
936
  source = "registry+https://github.com/rust-lang/crates.io-index"
917
937
  checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
918
938
 
939
+ [[package]]
940
+ name = "constant_time_eq"
941
+ version = "0.4.2"
942
+ source = "registry+https://github.com/rust-lang/crates.io-index"
943
+ checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
944
+
919
945
  [[package]]
920
946
  name = "cookie"
921
947
  version = "0.18.1"
@@ -2790,6 +2816,7 @@ dependencies = [
2790
2816
  "biblatex",
2791
2817
  "biblib",
2792
2818
  "bitvec",
2819
+ "blake3",
2793
2820
  "bytes",
2794
2821
  "calamine",
2795
2822
  "cfb 0.14.0",
@@ -6784,7 +6811,7 @@ dependencies = [
6784
6811
  "aes",
6785
6812
  "arbitrary",
6786
6813
  "bzip2 0.5.2",
6787
- "constant_time_eq",
6814
+ "constant_time_eq 0.3.1",
6788
6815
  "crc32fast",
6789
6816
  "crossbeam-utils",
6790
6817
  "deflate64",
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.5.1"
3
+ version = "4.5.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
data/lib/kreuzberg/config.rb CHANGED
@@ -930,7 +930,8 @@ module Kreuzberg
930
930
  :images, :postprocessor,
931
931
  :token_reduction, :keywords, :html_options, :pages,
932
932
  :max_concurrent_extractions, :output_format, :result_format,
933
- :security_limits, :layout, :concurrency
933
+ :security_limits, :layout, :concurrency,
934
+ :cache_namespace, :cache_ttl_secs
934
935
 
935
936
  # Alias for backward compatibility - image_extraction is the canonical name
936
937
  alias image_extraction images
@@ -955,7 +956,7 @@ module Kreuzberg
955
956
  language_detection pdf_options image_extraction
956
957
  postprocessor token_reduction keywords html_options pages
957
958
  max_concurrent_extractions output_format result_format
958
- security_limits layout concurrency
959
+ security_limits layout concurrency cache_namespace cache_ttl_secs
959
960
  ].freeze
960
961
 
961
962
  # Aliases for backward compatibility
@@ -1032,7 +1033,9 @@ module Kreuzberg
1032
1033
  result_format: nil,
1033
1034
  security_limits: nil,
1034
1035
  layout: nil,
1035
- concurrency: nil)
1036
+ concurrency: nil,
1037
+ cache_namespace: nil,
1038
+ cache_ttl_secs: nil)
1036
1039
  kwargs = {
1037
1040
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1038
1041
  force_ocr: force_ocr, include_document_structure: include_document_structure,
@@ -1043,7 +1046,9 @@ module Kreuzberg
1043
1046
  pages: pages, max_concurrent_extractions: max_concurrent_extractions,
1044
1047
  output_format: output_format, result_format: result_format,
1045
1048
  security_limits: security_limits, layout: layout,
1046
- concurrency: concurrency
1049
+ concurrency: concurrency,
1050
+ cache_namespace: cache_namespace,
1051
+ cache_ttl_secs: cache_ttl_secs
1047
1052
  }
1048
1053
  extracted = extract_from_hash(hash, kwargs)
1049
1054
 
@@ -1077,6 +1082,8 @@ module Kreuzberg
1077
1082
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1078
1083
  @output_format = validate_output_format(params[:output_format])
1079
1084
  @result_format = validate_result_format(params[:result_format])
1085
+ @cache_namespace = params[:cache_namespace]
1086
+ @cache_ttl_secs = params[:cache_ttl_secs]&.to_i
1080
1087
  @security_limits = params[:security_limits]
1081
1088
  end
1082
1089
 
@@ -1112,7 +1119,9 @@ module Kreuzberg
1112
1119
  include_document_structure: @include_document_structure,
1113
1120
  max_concurrent_extractions: @max_concurrent_extractions,
1114
1121
  output_format: @output_format,
1115
- result_format: @result_format
1122
+ result_format: @result_format,
1123
+ cache_namespace: @cache_namespace,
1124
+ cache_ttl_secs: @cache_ttl_secs
1116
1125
  }
1117
1126
  end
1118
1127
 
@@ -1271,6 +1280,10 @@ module Kreuzberg
1271
1280
  @output_format = validate_output_format(value)
1272
1281
  when :result_format
1273
1282
  @result_format = validate_result_format(value)
1283
+ when :cache_namespace
1284
+ @cache_namespace = value
1285
+ when :cache_ttl_secs
1286
+ @cache_ttl_secs = value&.to_i
1274
1287
  else
1275
1288
  raise ArgumentError, "Unknown configuration key: #{key}"
1276
1289
  end
@@ -1352,6 +1365,8 @@ module Kreuzberg
1352
1365
  @max_concurrent_extractions = merged.max_concurrent_extractions
1353
1366
  @output_format = merged.output_format
1354
1367
  @result_format = merged.result_format
1368
+ @cache_namespace = merged.cache_namespace
1369
+ @cache_ttl_secs = merged.cache_ttl_secs
1355
1370
  end
1356
1371
  end
1357
1372
  end
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.5.1'
4
+ VERSION = '4.5.2'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -478,6 +478,8 @@ module Kreuzberg
478
478
  class Extraction
479
479
  attr_reader use_cache: bool
480
480
  attr_reader enable_quality_processing: bool
481
+ attr_reader cache_namespace: String?
482
+ attr_reader cache_ttl_secs: Integer?
481
483
  attr_reader force_ocr: bool
482
484
  attr_reader include_document_structure: bool
483
485
  attr_reader ocr: OCR?
@@ -520,7 +522,9 @@ module Kreuzberg
520
522
  ?concurrency: (Concurrency | Hash[Symbol, untyped])?,
521
523
  ?max_concurrent_extractions: Integer?,
522
524
  ?output_format: String?,
523
- ?result_format: String?
525
+ ?result_format: String?,
526
+ ?cache_namespace: String?,
527
+ ?cache_ttl_secs: Integer?
524
528
  ) -> void
525
529
  def to_h: () -> Hash[Symbol, untyped]
526
530
  def to_json: (*untyped) -> String
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.5.1"
5
+ version = "4.5.2"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -15,6 +15,7 @@ ahash = "0.8.12"
15
15
  anyhow = "1.0"
16
16
  async-trait = "0.1.89"
17
17
  base64 = "0.22.1"
18
+ blake3 = "1"
18
19
  bytes = { version = "1", features = ["serde"] }
19
20
  chrono = "0.4"
20
21
  clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
@@ -29,8 +30,8 @@ hwpers = "0.5"
29
30
  image = { version = "0.25.10", default-features = false }
30
31
  itertools = "0.14"
31
32
  js-sys = "0.3"
32
- kreuzberg = { path = "./crates/kreuzberg", version = "4.5.1", default-features = false }
33
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.1" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.5.2", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.2" }
34
35
  lazy_static = "1.5.0"
35
36
  libc = "0.2.183"
36
37
  log = "0.4"
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.5.1"
3
+ version = "4.5.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -192,6 +192,7 @@ pkg-config = "0.3"
192
192
  [dependencies]
193
193
  ahash = "0.8.12"
194
194
  async-trait = "0.1.89"
195
+ blake3 = "1"
195
196
  base64 = "0.22.1"
196
197
  bitvec = "1.0"
197
198
  bytes = { version = "1", features = ["serde"] }
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.5.1 Release**
21
+ > **🚀 Version 4.5.2 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.