kreuzberg 4.7.0 → 4.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +25 -24
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +6 -6
- data/vendor/kreuzberg/Cargo.toml +7 -6
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +3 -22
- data/vendor/kreuzberg/src/cache/core.rs +1 -3
- data/vendor/kreuzberg/src/cache_dir.rs +53 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +26 -5
- data/vendor/kreuzberg/src/chunking/processor.rs +9 -0
- data/vendor/kreuzberg/src/core/config/concurrency.rs +3 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +15 -8
- data/vendor/kreuzberg/src/core/extractor/batch.rs +1 -1
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +1 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +80 -1
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +43 -0
- data/vendor/kreuzberg/src/doc_orientation.rs +13 -9
- data/vendor/kreuzberg/src/embeddings/mod.rs +35 -19
- data/vendor/kreuzberg/src/extraction/derive.rs +1 -0
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +7 -12
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +1 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +6 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +21 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +32 -3
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +77 -105
- data/vendor/kreuzberg/src/keywords/processor.rs +7 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +5 -0
- data/vendor/kreuzberg/src/lib.rs +1 -0
- data/vendor/kreuzberg/src/mcp/format.rs +4 -0
- data/vendor/kreuzberg/src/mcp/params.rs +31 -0
- data/vendor/kreuzberg/src/mcp/server.rs +3 -12
- data/vendor/kreuzberg/src/model_download.rs +4 -11
- data/vendor/kreuzberg/src/ocr/cache.rs +13 -10
- data/vendor/kreuzberg/src/ocr/hocr_parser.rs +471 -10
- data/vendor/kreuzberg/src/ocr/layout_assembly.rs +18 -13
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +89 -31
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +1 -10
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/ocr/types.rs +1 -0
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +166 -1
- data/vendor/kreuzberg/src/paddle_ocr/config.rs +16 -12
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +48 -0
- data/vendor/kreuzberg/src/pdf/structure/adapters.rs +338 -79
- data/vendor/kreuzberg/src/pdf/structure/assembly.rs +84 -6
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1 -1
- data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +0 -3
- data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +49 -2
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +156 -1
- data/vendor/kreuzberg/src/pdf/structure/text_repair.rs +47 -16
- data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +28 -1
- data/vendor/kreuzberg/src/pdf/text.rs +100 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +1 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -0
- data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +4 -2
- data/vendor/kreuzberg/src/rendering/djot.rs +3 -2
- data/vendor/kreuzberg/src/rendering/markdown.rs +15 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +5 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +28 -10
- data/vendor/kreuzberg/src/types/extraction.rs +8 -0
- data/vendor/kreuzberg/src/types/formats.rs +5 -0
- data/vendor/kreuzberg/tests/config_features.rs +6 -33
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +2 -2
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +5 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +5 -3
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +7 -4
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +6 -4
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +3 -2
- data/vendor/kreuzberg-ffi/src/result.rs +1 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/angle_net.rs +4 -4
- data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/build.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/leptonica.rs +57 -0
- metadata +3 -9
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -68
- data/vendor/kreuzberg/examples/download_paddle_models.rs +0 -358
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg-pdfium-render/examples/artifact_check.rs +0 -70
- data/vendor/kreuzberg-pdfium-render/examples/char_order_check.rs +0 -63
- data/vendor/kreuzberg-pdfium-render/examples/ffi_bench.rs +0 -207
- data/vendor/kreuzberg-pdfium-render/examples/seg_dump.rs +0 -54
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c713d0a652ed4e1752ce0c3f6e044516e870349de149b2fdd5cb6c429c722d65
|
|
4
|
+
data.tar.gz: b519e5b70800b61bd6e5d6acef3a0fee0de8d53f0ad625e863753573f79caf4f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0cec6f964c7975e905997422f296a129fe32974c221300f1d6f89d5f54b27ca48e7343522e961d044cc90d9c7f0f4ef20c42de0c95b20e6794614cd1243bd03b
|
|
7
|
+
data.tar.gz: 8b173be0cb820ade74c4e572465a28257911e9cd1d4ed510f93d26e65529c47f1e637f59a215171b8f931f4a45037bb9b9e5069fd4542b4f911fa36908c65364
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -699,9 +699,9 @@ dependencies = [
|
|
|
699
699
|
|
|
700
700
|
[[package]]
|
|
701
701
|
name = "cc"
|
|
702
|
-
version = "1.2.
|
|
702
|
+
version = "1.2.59"
|
|
703
703
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
704
|
-
checksum = "
|
|
704
|
+
checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283"
|
|
705
705
|
dependencies = [
|
|
706
706
|
"find-msvc-tools",
|
|
707
707
|
"jobserver",
|
|
@@ -928,9 +928,9 @@ checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
|
|
|
928
928
|
|
|
929
929
|
[[package]]
|
|
930
930
|
name = "comrak"
|
|
931
|
-
version = "0.
|
|
931
|
+
version = "0.52.0"
|
|
932
932
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
933
|
-
checksum = "
|
|
933
|
+
checksum = "aac0b255932a9cd52fbfd664b67957f9f2e095ae4711cb0e41b4e291edef94c2"
|
|
934
934
|
dependencies = [
|
|
935
935
|
"caseless",
|
|
936
936
|
"entities",
|
|
@@ -2550,9 +2550,9 @@ checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
|
|
|
2550
2550
|
|
|
2551
2551
|
[[package]]
|
|
2552
2552
|
name = "indexmap"
|
|
2553
|
-
version = "2.13.
|
|
2553
|
+
version = "2.13.1"
|
|
2554
2554
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2555
|
-
checksum = "
|
|
2555
|
+
checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
|
|
2556
2556
|
dependencies = [
|
|
2557
2557
|
"equivalent",
|
|
2558
2558
|
"hashbrown 0.16.1",
|
|
@@ -2792,7 +2792,7 @@ dependencies = [
|
|
|
2792
2792
|
|
|
2793
2793
|
[[package]]
|
|
2794
2794
|
name = "kreuzberg"
|
|
2795
|
-
version = "4.7.
|
|
2795
|
+
version = "4.7.2"
|
|
2796
2796
|
dependencies = [
|
|
2797
2797
|
"ahash",
|
|
2798
2798
|
"async-trait",
|
|
@@ -2810,6 +2810,7 @@ dependencies = [
|
|
|
2810
2810
|
"comrak",
|
|
2811
2811
|
"dashmap",
|
|
2812
2812
|
"dbase",
|
|
2813
|
+
"dirs",
|
|
2813
2814
|
"encoding_rs",
|
|
2814
2815
|
"fast_image_resize",
|
|
2815
2816
|
"flate2",
|
|
@@ -2884,7 +2885,7 @@ dependencies = [
|
|
|
2884
2885
|
|
|
2885
2886
|
[[package]]
|
|
2886
2887
|
name = "kreuzberg-ffi"
|
|
2887
|
-
version = "4.7.
|
|
2888
|
+
version = "4.7.2"
|
|
2888
2889
|
dependencies = [
|
|
2889
2890
|
"ahash",
|
|
2890
2891
|
"async-trait",
|
|
@@ -2900,7 +2901,7 @@ dependencies = [
|
|
|
2900
2901
|
|
|
2901
2902
|
[[package]]
|
|
2902
2903
|
name = "kreuzberg-paddle-ocr"
|
|
2903
|
-
version = "4.7.
|
|
2904
|
+
version = "4.7.2"
|
|
2904
2905
|
dependencies = [
|
|
2905
2906
|
"geo-clipper",
|
|
2906
2907
|
"geo-types",
|
|
@@ -2914,7 +2915,7 @@ dependencies = [
|
|
|
2914
2915
|
|
|
2915
2916
|
[[package]]
|
|
2916
2917
|
name = "kreuzberg-pdfium-render"
|
|
2917
|
-
version = "4.7.
|
|
2918
|
+
version = "4.7.2"
|
|
2918
2919
|
dependencies = [
|
|
2919
2920
|
"bitflags",
|
|
2920
2921
|
"bytemuck",
|
|
@@ -2937,7 +2938,7 @@ dependencies = [
|
|
|
2937
2938
|
|
|
2938
2939
|
[[package]]
|
|
2939
2940
|
name = "kreuzberg-rb"
|
|
2940
|
-
version = "4.7.
|
|
2941
|
+
version = "4.7.1"
|
|
2941
2942
|
dependencies = [
|
|
2942
2943
|
"async-trait",
|
|
2943
2944
|
"html-to-markdown-rs",
|
|
@@ -2954,7 +2955,7 @@ dependencies = [
|
|
|
2954
2955
|
|
|
2955
2956
|
[[package]]
|
|
2956
2957
|
name = "kreuzberg-tesseract"
|
|
2957
|
-
version = "4.7.
|
|
2958
|
+
version = "4.7.2"
|
|
2958
2959
|
dependencies = [
|
|
2959
2960
|
"cc",
|
|
2960
2961
|
"cmake",
|
|
@@ -4264,18 +4265,18 @@ dependencies = [
|
|
|
4264
4265
|
|
|
4265
4266
|
[[package]]
|
|
4266
4267
|
name = "rb-sys"
|
|
4267
|
-
version = "0.9.
|
|
4268
|
+
version = "0.9.126"
|
|
4268
4269
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4269
|
-
checksum = "
|
|
4270
|
+
checksum = "284799e73e899fe946fd77c7211b83bff61a1356e039ade7a2516a779e3212d0"
|
|
4270
4271
|
dependencies = [
|
|
4271
4272
|
"rb-sys-build",
|
|
4272
4273
|
]
|
|
4273
4274
|
|
|
4274
4275
|
[[package]]
|
|
4275
4276
|
name = "rb-sys-build"
|
|
4276
|
-
version = "0.9.
|
|
4277
|
+
version = "0.9.126"
|
|
4277
4278
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4278
|
-
checksum = "
|
|
4279
|
+
checksum = "855fc1ad8943d12c89ef12f9147f1cc531f5bf19fb744112fdd317bb6ee7b5c5"
|
|
4279
4280
|
dependencies = [
|
|
4280
4281
|
"bindgen",
|
|
4281
4282
|
"lazy_static",
|
|
@@ -4712,9 +4713,9 @@ dependencies = [
|
|
|
4712
4713
|
|
|
4713
4714
|
[[package]]
|
|
4714
4715
|
name = "semver"
|
|
4715
|
-
version = "1.0.
|
|
4716
|
+
version = "1.0.28"
|
|
4716
4717
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4717
|
-
checksum = "
|
|
4718
|
+
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
|
|
4718
4719
|
|
|
4719
4720
|
[[package]]
|
|
4720
4721
|
name = "seq-macro"
|
|
@@ -5374,9 +5375,9 @@ dependencies = [
|
|
|
5374
5375
|
|
|
5375
5376
|
[[package]]
|
|
5376
5377
|
name = "tokio"
|
|
5377
|
-
version = "1.
|
|
5378
|
+
version = "1.51.0"
|
|
5378
5379
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5379
|
-
checksum = "
|
|
5380
|
+
checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd"
|
|
5380
5381
|
dependencies = [
|
|
5381
5382
|
"bytes",
|
|
5382
5383
|
"libc",
|
|
@@ -5390,9 +5391,9 @@ dependencies = [
|
|
|
5390
5391
|
|
|
5391
5392
|
[[package]]
|
|
5392
5393
|
name = "tokio-macros"
|
|
5393
|
-
version = "2.
|
|
5394
|
+
version = "2.7.0"
|
|
5394
5395
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5395
|
-
checksum = "
|
|
5396
|
+
checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
|
|
5396
5397
|
dependencies = [
|
|
5397
5398
|
"proc-macro2",
|
|
5398
5399
|
"quote",
|
|
@@ -6616,9 +6617,9 @@ dependencies = [
|
|
|
6616
6617
|
|
|
6617
6618
|
[[package]]
|
|
6618
6619
|
name = "writeable"
|
|
6619
|
-
version = "0.6.
|
|
6620
|
+
version = "0.6.3"
|
|
6620
6621
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6621
|
-
checksum = "
|
|
6622
|
+
checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
|
|
6622
6623
|
|
|
6623
6624
|
[[package]]
|
|
6624
6625
|
name = "wyz"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.7.
|
|
3
|
+
version = "4.7.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
|
49
49
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
50
50
|
"rb-sys",
|
|
51
51
|
] }
|
|
52
|
-
rb-sys = { version = "0.9.
|
|
52
|
+
rb-sys = { version = "0.9.126", default-features = false, features = [
|
|
53
53
|
"stable-api-compiled-fallback",
|
|
54
54
|
] }
|
|
55
55
|
serde_json = "1.0.149"
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
serde_yaml_ng = "0.10"
|
|
58
|
-
tokio = { version = "1.
|
|
58
|
+
tokio = { version = "1.51.0", features = [
|
|
59
59
|
"rt",
|
|
60
60
|
"rt-multi-thread",
|
|
61
61
|
"macros",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.7.
|
|
5
|
+
version = "4.7.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -20,7 +20,7 @@ bytes = { version = "1", features = ["serde"] }
|
|
|
20
20
|
cfb = "0.14"
|
|
21
21
|
chrono = "0.4"
|
|
22
22
|
clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
23
|
-
comrak = { version = "0.
|
|
23
|
+
comrak = { version = "0.52", default-features = false }
|
|
24
24
|
console_error_panic_hook = "0.1"
|
|
25
25
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
26
26
|
ctor = "0.8"
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.7.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.7.2", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.2" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.184"
|
|
39
39
|
log = "0.4"
|
|
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
|
|
|
43
43
|
once_cell = "1.21.4"
|
|
44
44
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
45
45
|
parking_lot = "0.12.5"
|
|
46
|
-
pdf_oxide = { version = "0.3.
|
|
46
|
+
pdf_oxide = { version = "0.3.19", default-features = false }
|
|
47
47
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
48
48
|
rayon = "1.11.0"
|
|
49
49
|
reqwest = { version = "0.13.2", default-features = false }
|
|
@@ -52,7 +52,7 @@ serde_json = { version = "1.0.149" }
|
|
|
52
52
|
serde_toon_format = "0.1"
|
|
53
53
|
tempfile = "3.27.0"
|
|
54
54
|
thiserror = "2.0.18"
|
|
55
|
-
tokio = { version = "1.
|
|
55
|
+
tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
tracing = "0.1"
|
|
58
58
|
tree-sitter-language-pack = { version = "1.4.1", features = ["serde"], default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.7.
|
|
3
|
+
version = "4.7.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -253,9 +253,10 @@ calamine = { version = "0.34.0", features = ["dates"], optional = true }
|
|
|
253
253
|
cfb = { version = "0.14", optional = true }
|
|
254
254
|
chardetng = { version = "1.0.0", optional = true }
|
|
255
255
|
chrono = { version = "0.4", optional = true }
|
|
256
|
-
comrak = { version = "0.
|
|
256
|
+
comrak = { version = "0.52", default-features = false }
|
|
257
257
|
dashmap = "6.1"
|
|
258
258
|
dbase = { version = "0.7", optional = true }
|
|
259
|
+
dirs = "6"
|
|
259
260
|
encoding_rs = { version = "0.8.35" }
|
|
260
261
|
fast_image_resize = { version = "6.0.0", optional = true }
|
|
261
262
|
flate2 = { version = "1.1", optional = true }
|
|
@@ -279,7 +280,7 @@ image = { version = "0.25.10", default-features = false, features = [
|
|
|
279
280
|
"pnm",
|
|
280
281
|
"rayon",
|
|
281
282
|
], optional = true }
|
|
282
|
-
indexmap = "2.13.
|
|
283
|
+
indexmap = "2.13.1"
|
|
283
284
|
infer = "0.19.0"
|
|
284
285
|
jotdown = "0.9"
|
|
285
286
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
@@ -306,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
306
307
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
307
308
|
parking_lot = "0.12.5"
|
|
308
309
|
pastey = "0.2"
|
|
309
|
-
pdf_oxide = { version = "0.3.
|
|
310
|
+
pdf_oxide = { version = "0.3.19", default-features = false, optional = true }
|
|
310
311
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
311
312
|
pulldown-cmark = { version = "0.13" }
|
|
312
313
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
@@ -341,7 +342,7 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
|
|
|
341
342
|
"http",
|
|
342
343
|
"fancy-regex",
|
|
343
344
|
] }
|
|
344
|
-
tokio = { version = "1.
|
|
345
|
+
tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
|
|
345
346
|
toml = "1.1.2"
|
|
346
347
|
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
347
348
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
@@ -400,7 +401,7 @@ jsonschema = "0.45"
|
|
|
400
401
|
serial_test = "3.4.0"
|
|
401
402
|
tar = "0.4.45"
|
|
402
403
|
tempfile = "3.27.0"
|
|
403
|
-
tokio = { version = "1.
|
|
404
|
+
tokio = { version = "1.51.0", features = ["macros", "time"] }
|
|
404
405
|
tokio-test = "0.4"
|
|
405
406
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
406
407
|
zip = { version = ">=7.0.0, <7.4.0", default-features = false, features = ["deflate-flate2"] }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.7.
|
|
21
|
+
> **🚀 Version 4.7.2 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -316,14 +316,7 @@ pub async fn formats_handler() -> Json<Vec<crate::SupportedFormat>> {
|
|
|
316
316
|
)]
|
|
317
317
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
|
|
318
318
|
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
319
|
-
let cache_dir =
|
|
320
|
-
.map_err(|e| {
|
|
321
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
322
|
-
"Failed to get current directory: {}",
|
|
323
|
-
e
|
|
324
|
-
)))
|
|
325
|
-
})?
|
|
326
|
-
.join(".kreuzberg");
|
|
319
|
+
let cache_dir = crate::cache_dir::resolve_cache_base();
|
|
327
320
|
|
|
328
321
|
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
329
322
|
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
@@ -365,14 +358,7 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
|
|
|
365
358
|
)]
|
|
366
359
|
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
|
|
367
360
|
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
368
|
-
let cache_dir =
|
|
369
|
-
.map_err(|e| {
|
|
370
|
-
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
371
|
-
"Failed to get current directory: {}",
|
|
372
|
-
e
|
|
373
|
-
)))
|
|
374
|
-
})?
|
|
375
|
-
.join(".kreuzberg");
|
|
361
|
+
let cache_dir = crate::cache_dir::resolve_cache_base();
|
|
376
362
|
|
|
377
363
|
let cache_dir_str = cache_dir.to_str().ok_or_else(|| {
|
|
378
364
|
ApiError::internal(crate::error::KreuzbergError::Other(format!(
|
|
@@ -932,12 +918,7 @@ pub async fn cache_warm_handler(JsonApi(request): JsonApi<WarmRequest>) -> Resul
|
|
|
932
918
|
|
|
933
919
|
/// Resolve the cache base directory.
|
|
934
920
|
fn resolve_cache_base() -> std::path::PathBuf {
|
|
935
|
-
|
|
936
|
-
return std::path::PathBuf::from(env_path);
|
|
937
|
-
}
|
|
938
|
-
std::env::current_dir()
|
|
939
|
-
.unwrap_or_else(|_| std::path::PathBuf::from("."))
|
|
940
|
-
.join(".kreuzberg")
|
|
921
|
+
crate::cache_dir::resolve_cache_base()
|
|
941
922
|
}
|
|
942
923
|
|
|
943
924
|
#[cfg(test)]
|
|
@@ -66,10 +66,8 @@ impl GenericCache {
|
|
|
66
66
|
) -> Result<Self> {
|
|
67
67
|
let cache_dir_path = if let Some(dir) = cache_dir {
|
|
68
68
|
PathBuf::from(dir).join(&cache_type)
|
|
69
|
-
} else if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
|
|
70
|
-
PathBuf::from(env_path).join(&cache_type)
|
|
71
69
|
} else {
|
|
72
|
-
|
|
70
|
+
crate::cache_dir::resolve_cache_dir(&cache_type)
|
|
73
71
|
};
|
|
74
72
|
|
|
75
73
|
fs::create_dir_all(&cache_dir_path)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
//! Centralized cache directory resolution for all kreuzberg modules.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides a single function that all modules use to determine where to store
|
|
4
|
+
//! cached data (models, OCR results, tessdata, etc.). This avoids per-CWD
|
|
5
|
+
//! `.kreuzberg/` directories and uses platform-appropriate global cache locations.
|
|
6
|
+
|
|
7
|
+
use std::path::PathBuf;
|
|
8
|
+
|
|
9
|
+
/// Resolve the kreuzberg cache base directory (without a module suffix).
|
|
10
|
+
///
|
|
11
|
+
/// Uses the same resolution order as [`resolve_cache_dir`] but returns
|
|
12
|
+
/// the top-level kreuzberg cache directory.
|
|
13
|
+
#[allow(dead_code)]
|
|
14
|
+
pub fn resolve_cache_base() -> PathBuf {
|
|
15
|
+
if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
|
|
16
|
+
return PathBuf::from(env_path);
|
|
17
|
+
}
|
|
18
|
+
if let Some(cache) = dirs::cache_dir() {
|
|
19
|
+
return cache.join("kreuzberg");
|
|
20
|
+
}
|
|
21
|
+
if let Some(home) = dirs::home_dir() {
|
|
22
|
+
return home.join(".cache").join("kreuzberg");
|
|
23
|
+
}
|
|
24
|
+
std::env::current_dir()
|
|
25
|
+
.unwrap_or_else(|_| PathBuf::from("."))
|
|
26
|
+
.join(".kreuzberg")
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Resolve the kreuzberg cache directory for a given module.
|
|
30
|
+
///
|
|
31
|
+
/// Resolution order:
|
|
32
|
+
/// 1. `KREUZBERG_CACHE_DIR` env var + `/{module}` (explicit override)
|
|
33
|
+
/// 2. Platform-appropriate global cache directory:
|
|
34
|
+
/// - macOS: `~/Library/Caches/kreuzberg/{module}`
|
|
35
|
+
/// - Linux: `$XDG_CACHE_HOME/kreuzberg/{module}` or `~/.cache/kreuzberg/{module}`
|
|
36
|
+
/// - Windows: `%LOCALAPPDATA%/kreuzberg/{module}`
|
|
37
|
+
/// 3. Home directory fallback: `~/.cache/kreuzberg/{module}`
|
|
38
|
+
/// 4. CWD-relative fallback: `.kreuzberg/{module}` (last resort, e.g. no HOME set)
|
|
39
|
+
pub fn resolve_cache_dir(module: &str) -> PathBuf {
|
|
40
|
+
if let Ok(env_path) = std::env::var("KREUZBERG_CACHE_DIR") {
|
|
41
|
+
return PathBuf::from(env_path).join(module);
|
|
42
|
+
}
|
|
43
|
+
if let Some(cache) = dirs::cache_dir() {
|
|
44
|
+
return cache.join("kreuzberg").join(module);
|
|
45
|
+
}
|
|
46
|
+
if let Some(home) = dirs::home_dir() {
|
|
47
|
+
return home.join(".cache").join("kreuzberg").join(module);
|
|
48
|
+
}
|
|
49
|
+
std::env::current_dir()
|
|
50
|
+
.unwrap_or_else(|_| PathBuf::from("."))
|
|
51
|
+
.join(".kreuzberg")
|
|
52
|
+
.join(module)
|
|
53
|
+
}
|
|
@@ -28,9 +28,9 @@ pub fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
|
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
31
|
-
if boundary.byte_start
|
|
31
|
+
if boundary.byte_start > boundary.byte_end {
|
|
32
32
|
return Err(KreuzbergError::validation(format!(
|
|
33
|
-
"Invalid boundary range at index {}: byte_start ({}) must be
|
|
33
|
+
"Invalid boundary range at index {}: byte_start ({}) must be <= byte_end ({})",
|
|
34
34
|
idx, boundary.byte_start, boundary.byte_end
|
|
35
35
|
)));
|
|
36
36
|
}
|
|
@@ -287,6 +287,7 @@ mod tests {
|
|
|
287
287
|
|
|
288
288
|
#[test]
|
|
289
289
|
fn test_chunk_with_same_start_and_end() {
|
|
290
|
+
// Zero-length boundaries are valid (empty pages)
|
|
290
291
|
let boundaries = vec![PageBoundary {
|
|
291
292
|
byte_start: 10,
|
|
292
293
|
byte_end: 10,
|
|
@@ -294,8 +295,28 @@ mod tests {
|
|
|
294
295
|
}];
|
|
295
296
|
|
|
296
297
|
let result = validate_page_boundaries(&boundaries);
|
|
297
|
-
assert!(result.
|
|
298
|
-
|
|
299
|
-
|
|
298
|
+
assert!(result.is_ok());
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
#[test]
|
|
302
|
+
fn test_zero_length_boundary_is_valid() {
|
|
303
|
+
let boundaries = vec![
|
|
304
|
+
PageBoundary {
|
|
305
|
+
byte_start: 0,
|
|
306
|
+
byte_end: 100,
|
|
307
|
+
page_number: 1,
|
|
308
|
+
},
|
|
309
|
+
PageBoundary {
|
|
310
|
+
byte_start: 100,
|
|
311
|
+
byte_end: 100,
|
|
312
|
+
page_number: 2,
|
|
313
|
+
}, // empty page
|
|
314
|
+
PageBoundary {
|
|
315
|
+
byte_start: 100,
|
|
316
|
+
byte_end: 200,
|
|
317
|
+
page_number: 3,
|
|
318
|
+
},
|
|
319
|
+
];
|
|
320
|
+
assert!(validate_page_boundaries(&boundaries).is_ok());
|
|
300
321
|
}
|
|
301
322
|
}
|
|
@@ -146,6 +146,7 @@ mod tests {
|
|
|
146
146
|
#[cfg(feature = "tree-sitter")]
|
|
147
147
|
code_intelligence: None,
|
|
148
148
|
formatted_content: None,
|
|
149
|
+
ocr_internal_document: None,
|
|
149
150
|
};
|
|
150
151
|
|
|
151
152
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -183,6 +184,7 @@ mod tests {
|
|
|
183
184
|
#[cfg(feature = "tree-sitter")]
|
|
184
185
|
code_intelligence: None,
|
|
185
186
|
formatted_content: None,
|
|
187
|
+
ocr_internal_document: None,
|
|
186
188
|
};
|
|
187
189
|
|
|
188
190
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -232,6 +234,7 @@ mod tests {
|
|
|
232
234
|
#[cfg(feature = "tree-sitter")]
|
|
233
235
|
code_intelligence: None,
|
|
234
236
|
formatted_content: None,
|
|
237
|
+
ocr_internal_document: None,
|
|
235
238
|
};
|
|
236
239
|
|
|
237
240
|
let config_with_chunking = ExtractionConfig {
|
|
@@ -277,6 +280,7 @@ mod tests {
|
|
|
277
280
|
#[cfg(feature = "tree-sitter")]
|
|
278
281
|
code_intelligence: None,
|
|
279
282
|
formatted_content: None,
|
|
283
|
+
ocr_internal_document: None,
|
|
280
284
|
};
|
|
281
285
|
|
|
282
286
|
let long_result = ExtractionResult {
|
|
@@ -302,6 +306,7 @@ mod tests {
|
|
|
302
306
|
#[cfg(feature = "tree-sitter")]
|
|
303
307
|
code_intelligence: None,
|
|
304
308
|
formatted_content: None,
|
|
309
|
+
ocr_internal_document: None,
|
|
305
310
|
};
|
|
306
311
|
|
|
307
312
|
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
@@ -356,6 +361,7 @@ mod tests {
|
|
|
356
361
|
#[cfg(feature = "tree-sitter")]
|
|
357
362
|
code_intelligence: None,
|
|
358
363
|
formatted_content: None,
|
|
364
|
+
ocr_internal_document: None,
|
|
359
365
|
};
|
|
360
366
|
|
|
361
367
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -402,6 +408,7 @@ mod tests {
|
|
|
402
408
|
#[cfg(feature = "tree-sitter")]
|
|
403
409
|
code_intelligence: None,
|
|
404
410
|
formatted_content: None,
|
|
411
|
+
ocr_internal_document: None,
|
|
405
412
|
};
|
|
406
413
|
|
|
407
414
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -448,6 +455,7 @@ mod tests {
|
|
|
448
455
|
#[cfg(feature = "tree-sitter")]
|
|
449
456
|
code_intelligence: None,
|
|
450
457
|
formatted_content: None,
|
|
458
|
+
ocr_internal_document: None,
|
|
451
459
|
};
|
|
452
460
|
|
|
453
461
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -494,6 +502,7 @@ mod tests {
|
|
|
494
502
|
#[cfg(feature = "tree-sitter")]
|
|
495
503
|
code_intelligence: None,
|
|
496
504
|
formatted_content: None,
|
|
505
|
+
ocr_internal_document: None,
|
|
497
506
|
};
|
|
498
507
|
|
|
499
508
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -68,7 +68,10 @@ pub fn resolve_thread_budget(config: Option<&ConcurrencyConfig>) -> usize {
|
|
|
68
68
|
/// ```
|
|
69
69
|
pub fn init_thread_pools(budget: usize) {
|
|
70
70
|
POOL_INIT.call_once(|| {
|
|
71
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
71
72
|
rayon::ThreadPoolBuilder::new().num_threads(budget).build_global().ok();
|
|
73
|
+
#[cfg(target_arch = "wasm32")]
|
|
74
|
+
let _ = budget;
|
|
72
75
|
});
|
|
73
76
|
}
|
|
74
77
|
|
|
@@ -404,18 +404,25 @@ impl ExtractionConfig {
|
|
|
404
404
|
/// - Auto-enabling `extract_pages` when `result_format` is `ElementBased`, because
|
|
405
405
|
/// the element transformation requires per-page data to assign correct page numbers.
|
|
406
406
|
/// Without this, all elements would incorrectly get `page_number=1`.
|
|
407
|
+
/// - Auto-enabling `extract_pages` when chunking is configured, because the chunker
|
|
408
|
+
/// needs page boundaries to assign correct page numbers to chunks.
|
|
407
409
|
pub fn normalized(&self) -> std::borrow::Cow<'_, Self> {
|
|
408
|
-
|
|
409
|
-
|
|
410
|
+
let needs_pages = |cfg: &Self| -> bool {
|
|
411
|
+
match &cfg.pages {
|
|
410
412
|
Some(page_config) => !page_config.extract_pages,
|
|
411
413
|
None => true,
|
|
412
|
-
};
|
|
413
|
-
if needs_pages {
|
|
414
|
-
let mut config = self.clone();
|
|
415
|
-
let page_config = config.pages.get_or_insert_with(super::super::page::PageConfig::default);
|
|
416
|
-
page_config.extract_pages = true;
|
|
417
|
-
return std::borrow::Cow::Owned(config);
|
|
418
414
|
}
|
|
415
|
+
};
|
|
416
|
+
|
|
417
|
+
let needs_pages_for_elements =
|
|
418
|
+
self.result_format == crate::types::OutputFormat::ElementBased && needs_pages(self);
|
|
419
|
+
let needs_pages_for_chunking = self.chunking.is_some() && needs_pages(self);
|
|
420
|
+
|
|
421
|
+
if needs_pages_for_elements || needs_pages_for_chunking {
|
|
422
|
+
let mut config = self.clone();
|
|
423
|
+
let page_config = config.pages.get_or_insert_with(super::super::page::PageConfig::default);
|
|
424
|
+
page_config.extract_pages = true;
|
|
425
|
+
return std::borrow::Cow::Owned(config);
|
|
419
426
|
}
|
|
420
427
|
std::borrow::Cow::Borrowed(self)
|
|
421
428
|
}
|
|
@@ -33,7 +33,7 @@ where
|
|
|
33
33
|
let max_concurrent = config
|
|
34
34
|
.max_concurrent_extractions
|
|
35
35
|
.or_else(|| config.concurrency.as_ref().and_then(|c| c.max_threads))
|
|
36
|
-
.unwrap_or_else(||
|
|
36
|
+
.unwrap_or_else(|| crate::core::config::concurrency::resolve_thread_budget(config.concurrency.as_ref()));
|
|
37
37
|
let semaphore = Arc::new(Semaphore::new(max_concurrent));
|
|
38
38
|
|
|
39
39
|
let mut tasks = JoinSet::new();
|