kreuzberg 4.5.1 → 4.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +28 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/config.rb +20 -5
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/vendor/Cargo.toml +4 -3
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +483 -2
- data/vendor/kreuzberg/src/api/mod.rs +7 -2
- data/vendor/kreuzberg/src/api/openapi.rs +19 -0
- data/vendor/kreuzberg/src/api/router.rs +7 -3
- data/vendor/kreuzberg/src/api/types.rs +75 -0
- data/vendor/kreuzberg/src/cache/core.rs +223 -122
- data/vendor/kreuzberg/src/cache/mod.rs +20 -16
- data/vendor/kreuzberg/src/cache/utilities.rs +62 -44
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +18 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +79 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +14 -3
- data/vendor/kreuzberg/src/layout/engine.rs +3 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -1
- data/vendor/kreuzberg/src/mcp/params.rs +87 -0
- data/vendor/kreuzberg/src/mcp/server.rs +585 -5
- data/vendor/kreuzberg/src/ocr/cache.rs +1 -1
- data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +21 -23
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +6 -25
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +29 -9
- data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +254 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +6 -10
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +11 -0
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +9 -1
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +98 -6
- data/vendor/kreuzberg/src/pdf/markdown/mod.rs +1 -1
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +43 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/tables.rs +11 -1
- data/vendor/kreuzberg/src/pdf/markdown/render.rs +22 -16
- data/vendor/kreuzberg/src/pdf/markdown/text_repair.rs +209 -47
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +10 -1
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/pdf/text_data.rs +15 -6
- data/vendor/kreuzberg/tests/instrumentation_test.rs +2 -2
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +46 -2
- data/vendor/kreuzberg-ffi/src/config_builder.rs +81 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/segment.rs +13 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text.rs +148 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +61 -0
- metadata +2 -5
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +0 -179
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +0 -431
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +0 -150
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +0 -11
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2f29e7c9b7614fc78e0c54f673a804f79625081faa79317da54647937fe51a46
|
|
4
|
+
data.tar.gz: b465d7be3c677c7a7a87eb888503f57b7cf42e5bac353418a191cc2629ad3d5c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fc25d857d8252f4759ed2ea07003107843182c87d855872da228f599371cdb9f705d2883995bc17ac6dd2fadf12d6aa2023eb1abf6f69f5e2844b1a90473cb02
|
|
7
|
+
data.tar.gz: 5fe146eebe572f4a6b5ac89d9e187b97eb72787493b4748ba66c968014cbc7757b0ee6bec64516968ff7419a0c6c4c5b57e0b88240a61cde454ac72fa0fed9e7
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.5.1)
|
|
4
|
+
kreuzberg (4.5.2)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.5.1)
|
|
225
|
+
kreuzberg (4.5.2)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -161,6 +161,12 @@ dependencies = [
|
|
|
161
161
|
"syn",
|
|
162
162
|
]
|
|
163
163
|
|
|
164
|
+
[[package]]
|
|
165
|
+
name = "arrayref"
|
|
166
|
+
version = "0.3.9"
|
|
167
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
168
|
+
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
|
|
169
|
+
|
|
164
170
|
[[package]]
|
|
165
171
|
name = "arrayvec"
|
|
166
172
|
version = "0.7.6"
|
|
@@ -473,6 +479,20 @@ dependencies = [
|
|
|
473
479
|
"wyz",
|
|
474
480
|
]
|
|
475
481
|
|
|
482
|
+
[[package]]
|
|
483
|
+
name = "blake3"
|
|
484
|
+
version = "1.8.3"
|
|
485
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
486
|
+
checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
|
|
487
|
+
dependencies = [
|
|
488
|
+
"arrayref",
|
|
489
|
+
"arrayvec",
|
|
490
|
+
"cc",
|
|
491
|
+
"cfg-if",
|
|
492
|
+
"constant_time_eq 0.4.2",
|
|
493
|
+
"cpufeatures 0.2.17",
|
|
494
|
+
]
|
|
495
|
+
|
|
476
496
|
[[package]]
|
|
477
497
|
name = "block-buffer"
|
|
478
498
|
version = "0.10.4"
|
|
@@ -916,6 +936,12 @@ version = "0.3.1"
|
|
|
916
936
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
917
937
|
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
|
918
938
|
|
|
939
|
+
[[package]]
|
|
940
|
+
name = "constant_time_eq"
|
|
941
|
+
version = "0.4.2"
|
|
942
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
943
|
+
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
|
944
|
+
|
|
919
945
|
[[package]]
|
|
920
946
|
name = "cookie"
|
|
921
947
|
version = "0.18.1"
|
|
@@ -2790,6 +2816,7 @@ dependencies = [
|
|
|
2790
2816
|
"biblatex",
|
|
2791
2817
|
"biblib",
|
|
2792
2818
|
"bitvec",
|
|
2819
|
+
"blake3",
|
|
2793
2820
|
"bytes",
|
|
2794
2821
|
"calamine",
|
|
2795
2822
|
"cfb 0.14.0",
|
|
@@ -6784,7 +6811,7 @@ dependencies = [
|
|
|
6784
6811
|
"aes",
|
|
6785
6812
|
"arbitrary",
|
|
6786
6813
|
"bzip2 0.5.2",
|
|
6787
|
-
"constant_time_eq",
|
|
6814
|
+
"constant_time_eq 0.3.1",
|
|
6788
6815
|
"crc32fast",
|
|
6789
6816
|
"crossbeam-utils",
|
|
6790
6817
|
"deflate64",
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -930,7 +930,8 @@ module Kreuzberg
|
|
|
930
930
|
:images, :postprocessor,
|
|
931
931
|
:token_reduction, :keywords, :html_options, :pages,
|
|
932
932
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
933
|
-
:security_limits, :layout, :concurrency
|
|
933
|
+
:security_limits, :layout, :concurrency,
|
|
934
|
+
:cache_namespace, :cache_ttl_secs
|
|
934
935
|
|
|
935
936
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
936
937
|
alias image_extraction images
|
|
@@ -955,7 +956,7 @@ module Kreuzberg
|
|
|
955
956
|
language_detection pdf_options image_extraction
|
|
956
957
|
postprocessor token_reduction keywords html_options pages
|
|
957
958
|
max_concurrent_extractions output_format result_format
|
|
958
|
-
security_limits layout concurrency
|
|
959
|
+
security_limits layout concurrency cache_namespace cache_ttl_secs
|
|
959
960
|
].freeze
|
|
960
961
|
|
|
961
962
|
# Aliases for backward compatibility
|
|
@@ -1032,7 +1033,9 @@ module Kreuzberg
|
|
|
1032
1033
|
result_format: nil,
|
|
1033
1034
|
security_limits: nil,
|
|
1034
1035
|
layout: nil,
|
|
1035
|
-
concurrency: nil
|
|
1036
|
+
concurrency: nil,
|
|
1037
|
+
cache_namespace: nil,
|
|
1038
|
+
cache_ttl_secs: nil)
|
|
1036
1039
|
kwargs = {
|
|
1037
1040
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1038
1041
|
force_ocr: force_ocr, include_document_structure: include_document_structure,
|
|
@@ -1043,7 +1046,9 @@ module Kreuzberg
|
|
|
1043
1046
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
1044
1047
|
output_format: output_format, result_format: result_format,
|
|
1045
1048
|
security_limits: security_limits, layout: layout,
|
|
1046
|
-
concurrency: concurrency
|
|
1049
|
+
concurrency: concurrency,
|
|
1050
|
+
cache_namespace: cache_namespace,
|
|
1051
|
+
cache_ttl_secs: cache_ttl_secs
|
|
1047
1052
|
}
|
|
1048
1053
|
extracted = extract_from_hash(hash, kwargs)
|
|
1049
1054
|
|
|
@@ -1077,6 +1082,8 @@ module Kreuzberg
|
|
|
1077
1082
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1078
1083
|
@output_format = validate_output_format(params[:output_format])
|
|
1079
1084
|
@result_format = validate_result_format(params[:result_format])
|
|
1085
|
+
@cache_namespace = params[:cache_namespace]
|
|
1086
|
+
@cache_ttl_secs = params[:cache_ttl_secs]&.to_i
|
|
1080
1087
|
@security_limits = params[:security_limits]
|
|
1081
1088
|
end
|
|
1082
1089
|
|
|
@@ -1112,7 +1119,9 @@ module Kreuzberg
|
|
|
1112
1119
|
include_document_structure: @include_document_structure,
|
|
1113
1120
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1114
1121
|
output_format: @output_format,
|
|
1115
|
-
result_format: @result_format
|
|
1122
|
+
result_format: @result_format,
|
|
1123
|
+
cache_namespace: @cache_namespace,
|
|
1124
|
+
cache_ttl_secs: @cache_ttl_secs
|
|
1116
1125
|
}
|
|
1117
1126
|
end
|
|
1118
1127
|
|
|
@@ -1271,6 +1280,10 @@ module Kreuzberg
|
|
|
1271
1280
|
@output_format = validate_output_format(value)
|
|
1272
1281
|
when :result_format
|
|
1273
1282
|
@result_format = validate_result_format(value)
|
|
1283
|
+
when :cache_namespace
|
|
1284
|
+
@cache_namespace = value
|
|
1285
|
+
when :cache_ttl_secs
|
|
1286
|
+
@cache_ttl_secs = value&.to_i
|
|
1274
1287
|
else
|
|
1275
1288
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1276
1289
|
end
|
|
@@ -1352,6 +1365,8 @@ module Kreuzberg
|
|
|
1352
1365
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1353
1366
|
@output_format = merged.output_format
|
|
1354
1367
|
@result_format = merged.result_format
|
|
1368
|
+
@cache_namespace = merged.cache_namespace
|
|
1369
|
+
@cache_ttl_secs = merged.cache_ttl_secs
|
|
1355
1370
|
end
|
|
1356
1371
|
end
|
|
1357
1372
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -478,6 +478,8 @@ module Kreuzberg
|
|
|
478
478
|
class Extraction
|
|
479
479
|
attr_reader use_cache: bool
|
|
480
480
|
attr_reader enable_quality_processing: bool
|
|
481
|
+
attr_reader cache_namespace: String?
|
|
482
|
+
attr_reader cache_ttl_secs: Integer?
|
|
481
483
|
attr_reader force_ocr: bool
|
|
482
484
|
attr_reader include_document_structure: bool
|
|
483
485
|
attr_reader ocr: OCR?
|
|
@@ -520,7 +522,9 @@ module Kreuzberg
|
|
|
520
522
|
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
521
523
|
?max_concurrent_extractions: Integer?,
|
|
522
524
|
?output_format: String?,
|
|
523
|
-
?result_format: String
|
|
525
|
+
?result_format: String?,
|
|
526
|
+
?cache_namespace: String?,
|
|
527
|
+
?cache_ttl_secs: Integer?
|
|
524
528
|
) -> void
|
|
525
529
|
def to_h: () -> Hash[Symbol, untyped]
|
|
526
530
|
def to_json: (*untyped) -> String
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.5.1"
|
|
5
|
+
version = "4.5.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -15,6 +15,7 @@ ahash = "0.8.12"
|
|
|
15
15
|
anyhow = "1.0"
|
|
16
16
|
async-trait = "0.1.89"
|
|
17
17
|
base64 = "0.22.1"
|
|
18
|
+
blake3 = "1"
|
|
18
19
|
bytes = { version = "1", features = ["serde"] }
|
|
19
20
|
chrono = "0.4"
|
|
20
21
|
clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
@@ -29,8 +30,8 @@ hwpers = "0.5"
|
|
|
29
30
|
image = { version = "0.25.10", default-features = false }
|
|
30
31
|
itertools = "0.14"
|
|
31
32
|
js-sys = "0.3"
|
|
32
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.1", default-features = false }
|
|
33
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.1" }
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.2", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.2" }
|
|
34
35
|
lazy_static = "1.5.0"
|
|
35
36
|
libc = "0.2.183"
|
|
36
37
|
log = "0.4"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.5.1"
|
|
3
|
+
version = "4.5.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -192,6 +192,7 @@ pkg-config = "0.3"
|
|
|
192
192
|
[dependencies]
|
|
193
193
|
ahash = "0.8.12"
|
|
194
194
|
async-trait = "0.1.89"
|
|
195
|
+
blake3 = "1"
|
|
195
196
|
base64 = "0.22.1"
|
|
196
197
|
bitvec = "1.0"
|
|
197
198
|
bytes = { version = "1", features = ["serde"] }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.5.1 Release**
|
|
21
|
+
> **🚀 Version 4.5.2 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|