kreuzberg 4.8.0 → 4.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +17 -71
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +6 -6
- data/vendor/kreuzberg/Cargo.toml +8 -8
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/content_filter.rs +13 -6
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +14 -0
- data/vendor/kreuzberg/src/core/config/html_output.rs +136 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +4 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +3 -2
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +64 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +97 -26
- data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +5 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +290 -47
- data/vendor/kreuzberg/src/extractors/docbook.rs +44 -8
- data/vendor/kreuzberg/src/extractors/docx.rs +16 -5
- data/vendor/kreuzberg/src/extractors/html.rs +61 -14
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +64 -19
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +20 -5
- data/vendor/kreuzberg/src/extractors/pptx.rs +36 -23
- data/vendor/kreuzberg/src/extractors/rst.rs +33 -8
- data/vendor/kreuzberg/src/keywords/yake/context.rs +55 -0
- data/vendor/kreuzberg/src/keywords/yake/counter.rs +42 -0
- data/vendor/kreuzberg/src/keywords/yake/mod.rs +550 -0
- data/vendor/kreuzberg/src/keywords/yake/plural_helper.rs +40 -0
- data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs +376 -0
- data/vendor/kreuzberg/src/keywords/yake/result_item.rs +113 -0
- data/vendor/kreuzberg/src/keywords/yake/stats.rs +106 -0
- data/vendor/kreuzberg/src/keywords/yake/tag.rs +174 -0
- data/vendor/kreuzberg/src/lib.rs +5 -0
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +133 -0
- data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +18 -0
- data/vendor/kreuzberg/src/rendering/html_styled.rs +711 -0
- data/vendor/kreuzberg/src/rendering/mod.rs +4 -0
- data/vendor/kreuzberg/tests/api_consistency.rs +2 -0
- data/vendor/kreuzberg/tests/issue_670_content_filter_layout_regression.rs +152 -0
- data/vendor/kreuzberg/tests/issue_671_pptx_image_config_regression.rs +161 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +14 -3
- data/vendor/kreuzberg/src/keywords/yake.rs +0 -163
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 051fc6ca84b8545cb7049bd336a4897888b98f125fe40bfa24c6ab2e15232114
|
|
4
|
+
data.tar.gz: f64b385b1612258e73a6f771d107375ca1f892af7af199cbbaa8b19f184058df
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 949b5cdda46928b7981d4178efae64d64568155431785520774343fd27b47d2ebc7432d859fd404ff7f405cc93be7dbfda704e618b2aa079ff7d9522f7dac4c4
|
|
7
|
+
data.tar.gz: c6ea82f649544a0db19accb6b4eca03af5e9619321a973fe5cac9ad1524014bb766dc7edcec302bb2bef64c7b7ab0deb0ba1760fe80ccf2b5547c1bc11d4ad29
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -2167,18 +2167,6 @@ dependencies = [
|
|
|
2167
2167
|
"markup5ever",
|
|
2168
2168
|
]
|
|
2169
2169
|
|
|
2170
|
-
[[package]]
|
|
2171
|
-
name = "htmlize"
|
|
2172
|
-
version = "1.0.6"
|
|
2173
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2174
|
-
checksum = "d347c0de239be20ba0982e4822de3124404281e119ae3e11f5d7425a414e1935"
|
|
2175
|
-
dependencies = [
|
|
2176
|
-
"matchgen",
|
|
2177
|
-
"memchr",
|
|
2178
|
-
"pastey 0.1.1",
|
|
2179
|
-
"serde_json",
|
|
2180
|
-
]
|
|
2181
|
-
|
|
2182
2170
|
[[package]]
|
|
2183
2171
|
name = "http"
|
|
2184
2172
|
version = "1.4.0"
|
|
@@ -2792,7 +2780,7 @@ dependencies = [
|
|
|
2792
2780
|
|
|
2793
2781
|
[[package]]
|
|
2794
2782
|
name = "kreuzberg"
|
|
2795
|
-
version = "4.
|
|
2783
|
+
version = "4.8.1"
|
|
2796
2784
|
dependencies = [
|
|
2797
2785
|
"ahash",
|
|
2798
2786
|
"async-trait",
|
|
@@ -2880,14 +2868,14 @@ dependencies = [
|
|
|
2880
2868
|
"ureq 3.3.0",
|
|
2881
2869
|
"urlencoding",
|
|
2882
2870
|
"utoipa",
|
|
2871
|
+
"v_htmlescape",
|
|
2883
2872
|
"whatlang",
|
|
2884
|
-
"yake-rust",
|
|
2885
2873
|
"zip 8.5.1",
|
|
2886
2874
|
]
|
|
2887
2875
|
|
|
2888
2876
|
[[package]]
|
|
2889
2877
|
name = "kreuzberg-ffi"
|
|
2890
|
-
version = "4.
|
|
2878
|
+
version = "4.8.1"
|
|
2891
2879
|
dependencies = [
|
|
2892
2880
|
"ahash",
|
|
2893
2881
|
"async-trait",
|
|
@@ -2903,7 +2891,7 @@ dependencies = [
|
|
|
2903
2891
|
|
|
2904
2892
|
[[package]]
|
|
2905
2893
|
name = "kreuzberg-paddle-ocr"
|
|
2906
|
-
version = "4.
|
|
2894
|
+
version = "4.8.1"
|
|
2907
2895
|
dependencies = [
|
|
2908
2896
|
"geo-clipper",
|
|
2909
2897
|
"geo-types",
|
|
@@ -2917,7 +2905,7 @@ dependencies = [
|
|
|
2917
2905
|
|
|
2918
2906
|
[[package]]
|
|
2919
2907
|
name = "kreuzberg-pdfium-render"
|
|
2920
|
-
version = "4.
|
|
2908
|
+
version = "4.8.1"
|
|
2921
2909
|
dependencies = [
|
|
2922
2910
|
"bitflags",
|
|
2923
2911
|
"bytemuck",
|
|
@@ -2940,7 +2928,7 @@ dependencies = [
|
|
|
2940
2928
|
|
|
2941
2929
|
[[package]]
|
|
2942
2930
|
name = "kreuzberg-rb"
|
|
2943
|
-
version = "4.
|
|
2931
|
+
version = "4.8.1"
|
|
2944
2932
|
dependencies = [
|
|
2945
2933
|
"async-trait",
|
|
2946
2934
|
"html-to-markdown-rs",
|
|
@@ -2957,7 +2945,7 @@ dependencies = [
|
|
|
2957
2945
|
|
|
2958
2946
|
[[package]]
|
|
2959
2947
|
name = "kreuzberg-tesseract"
|
|
2960
|
-
version = "4.
|
|
2948
|
+
version = "4.8.1"
|
|
2961
2949
|
dependencies = [
|
|
2962
2950
|
"cc",
|
|
2963
2951
|
"cmake",
|
|
@@ -2984,12 +2972,6 @@ version = "0.5.3"
|
|
|
2984
2972
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2985
2973
|
checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
|
|
2986
2974
|
|
|
2987
|
-
[[package]]
|
|
2988
|
-
name = "levenshtein"
|
|
2989
|
-
version = "1.0.5"
|
|
2990
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2991
|
-
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
|
2992
|
-
|
|
2993
2975
|
[[package]]
|
|
2994
2976
|
name = "libbz2-rs-sys"
|
|
2995
2977
|
version = "0.2.2"
|
|
@@ -3229,12 +3211,6 @@ dependencies = [
|
|
|
3229
3211
|
"web_atoms",
|
|
3230
3212
|
]
|
|
3231
3213
|
|
|
3232
|
-
[[package]]
|
|
3233
|
-
name = "matchgen"
|
|
3234
|
-
version = "0.3.1"
|
|
3235
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3236
|
-
checksum = "e3c6ed90dceac899d670024e99486140739a14a1bda2bd05604689b8979a2894"
|
|
3237
|
-
|
|
3238
3214
|
[[package]]
|
|
3239
3215
|
name = "matchit"
|
|
3240
3216
|
version = "0.8.4"
|
|
@@ -4754,18 +4730,6 @@ dependencies = [
|
|
|
4754
4730
|
"libc",
|
|
4755
4731
|
]
|
|
4756
4732
|
|
|
4757
|
-
[[package]]
|
|
4758
|
-
name = "segtok"
|
|
4759
|
-
version = "0.1.5"
|
|
4760
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4761
|
-
checksum = "80ee4d89b4a3f29cd09fc8dd79c26531298035276cdfd0673ec7e543fff32e51"
|
|
4762
|
-
dependencies = [
|
|
4763
|
-
"either",
|
|
4764
|
-
"fancy-regex",
|
|
4765
|
-
"htmlize",
|
|
4766
|
-
"itertools 0.14.0",
|
|
4767
|
-
]
|
|
4768
|
-
|
|
4769
4733
|
[[package]]
|
|
4770
4734
|
name = "semver"
|
|
4771
4735
|
version = "1.0.28"
|
|
@@ -5097,15 +5061,6 @@ version = "0.1.9"
|
|
|
5097
5061
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5098
5062
|
checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520"
|
|
5099
5063
|
|
|
5100
|
-
[[package]]
|
|
5101
|
-
name = "streaming-stats"
|
|
5102
|
-
version = "0.2.3"
|
|
5103
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5104
|
-
checksum = "b0d670ce4e348a2081843569e0f79b21c99c91bb9028b3b3ecb0f050306de547"
|
|
5105
|
-
dependencies = [
|
|
5106
|
-
"num-traits",
|
|
5107
|
-
]
|
|
5108
|
-
|
|
5109
5064
|
[[package]]
|
|
5110
5065
|
name = "string_cache"
|
|
5111
5066
|
version = "0.9.0"
|
|
@@ -5430,9 +5385,9 @@ dependencies = [
|
|
|
5430
5385
|
|
|
5431
5386
|
[[package]]
|
|
5432
5387
|
name = "tokio"
|
|
5433
|
-
version = "1.51.
|
|
5388
|
+
version = "1.51.1"
|
|
5434
5389
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5435
|
-
checksum = "
|
|
5390
|
+
checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c"
|
|
5436
5391
|
dependencies = [
|
|
5437
5392
|
"bytes",
|
|
5438
5393
|
"libc",
|
|
@@ -5710,9 +5665,9 @@ checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
|
|
|
5710
5665
|
|
|
5711
5666
|
[[package]]
|
|
5712
5667
|
name = "tree-sitter-language-pack"
|
|
5713
|
-
version = "1.4.
|
|
5668
|
+
version = "1.4.2"
|
|
5714
5669
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5715
|
-
checksum = "
|
|
5670
|
+
checksum = "6ed9cacce88ea8b3a92813649012c1ef387bc864f1dd8398843e8b9e076233b0"
|
|
5716
5671
|
dependencies = [
|
|
5717
5672
|
"ahash",
|
|
5718
5673
|
"cc",
|
|
@@ -6007,6 +5962,12 @@ dependencies = [
|
|
|
6007
5962
|
"wasm-bindgen",
|
|
6008
5963
|
]
|
|
6009
5964
|
|
|
5965
|
+
[[package]]
|
|
5966
|
+
name = "v_htmlescape"
|
|
5967
|
+
version = "0.15.8"
|
|
5968
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5969
|
+
checksum = "4e8257fbc510f0a46eb602c10215901938b5c2a7d5e70fc11483b1d3c9b5b18c"
|
|
5970
|
+
|
|
6010
5971
|
[[package]]
|
|
6011
5972
|
name = "valuable"
|
|
6012
5973
|
version = "0.1.1"
|
|
@@ -6715,21 +6676,6 @@ version = "0.8.0"
|
|
|
6715
6676
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6716
6677
|
checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448"
|
|
6717
6678
|
|
|
6718
|
-
[[package]]
|
|
6719
|
-
name = "yake-rust"
|
|
6720
|
-
version = "1.0.3"
|
|
6721
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6722
|
-
checksum = "9ce4fc48712dfe1427747a6b1f2bb1ca7b9abb7168fbc086774f890b6e4e097f"
|
|
6723
|
-
dependencies = [
|
|
6724
|
-
"hashbrown 0.15.5",
|
|
6725
|
-
"indexmap",
|
|
6726
|
-
"itertools 0.14.0",
|
|
6727
|
-
"levenshtein",
|
|
6728
|
-
"regex",
|
|
6729
|
-
"segtok",
|
|
6730
|
-
"streaming-stats",
|
|
6731
|
-
]
|
|
6732
|
-
|
|
6733
6679
|
[[package]]
|
|
6734
6680
|
name = "yansi"
|
|
6735
6681
|
version = "1.0.1"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.8.
|
|
3
|
+
version = "4.8.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -55,7 +55,7 @@ rb-sys = { version = "0.9.126", default-features = false, features = [
|
|
|
55
55
|
serde_json = "1.0.149"
|
|
56
56
|
toml = "1.1.2"
|
|
57
57
|
serde_yaml_ng = "0.10"
|
|
58
|
-
tokio = { version = "1.51.
|
|
58
|
+
tokio = { version = "1.51.1", features = [
|
|
59
59
|
"rt",
|
|
60
60
|
"rt-multi-thread",
|
|
61
61
|
"macros",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.8.
|
|
5
|
+
version = "4.8.1"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.1", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.1" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.184"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.22", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.11.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
|
@@ -54,10 +54,10 @@ serde_json = { version = "1.0.149" }
|
|
|
54
54
|
serde_toon_format = "0.1"
|
|
55
55
|
tempfile = "3.27.0"
|
|
56
56
|
thiserror = "2.0.18"
|
|
57
|
-
tokio = { version = "1.51.
|
|
57
|
+
tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
58
|
toml = "1.1.2"
|
|
59
59
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.4.
|
|
60
|
+
tree-sitter-language-pack = { version = "1.4.2", features = ["serde"], default-features = false }
|
|
61
61
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
62
|
wasm-bindgen-futures = "0.4"
|
|
63
63
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.8.
|
|
3
|
+
version = "4.8.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -62,7 +62,7 @@ office = [
|
|
|
62
62
|
hwp = ["dep:cfb", "dep:flate2"]
|
|
63
63
|
iwork = ["dep:zip", "dep:snap"]
|
|
64
64
|
email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
|
|
65
|
-
html = ["dep:html-to-markdown-rs"]
|
|
65
|
+
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
|
|
66
66
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
67
67
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
|
|
68
68
|
mdx = []
|
|
@@ -144,7 +144,7 @@ embeddings = [
|
|
|
144
144
|
stopwords = []
|
|
145
145
|
quality = ["dep:unicode-normalization", "dep:chardetng", "stopwords"]
|
|
146
146
|
|
|
147
|
-
keywords-yake = ["
|
|
147
|
+
keywords-yake = ["stopwords"]
|
|
148
148
|
keywords-rake = ["dep:rake", "stopwords"]
|
|
149
149
|
keywords = ["keywords-yake", "keywords-rake"]
|
|
150
150
|
|
|
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.22", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
@@ -349,7 +349,7 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
|
|
|
349
349
|
"http",
|
|
350
350
|
"fancy-regex",
|
|
351
351
|
] }
|
|
352
|
-
tokio = { version = "1.51.
|
|
352
|
+
tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
|
|
353
353
|
toml = "1.1.2"
|
|
354
354
|
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
|
355
355
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
|
|
@@ -358,8 +358,8 @@ tracing-opentelemetry = { version = "0.32", optional = true }
|
|
|
358
358
|
unicode-normalization = { version = "0.1.25", optional = true }
|
|
359
359
|
urlencoding = "2"
|
|
360
360
|
utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
|
|
361
|
+
v_htmlescape = { version = "0.15", optional = true }
|
|
361
362
|
whatlang = { version = "0.18.0", optional = true }
|
|
362
|
-
yake-rust = { version = "1.0.3", optional = true }
|
|
363
363
|
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
|
364
364
|
"deflate-flate2",
|
|
365
365
|
] }
|
|
@@ -392,7 +392,7 @@ optional = true
|
|
|
392
392
|
# Override getrandom to enable js feature for WASM targets
|
|
393
393
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
394
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.4.
|
|
395
|
+
tree-sitter-language-pack = { version = "1.4.2", features = ["wasm", "serde"], default-features = false, optional = true }
|
|
396
396
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
397
|
|
|
398
398
|
[build-dependencies]
|
|
@@ -409,7 +409,7 @@ jsonschema = "0.45"
|
|
|
409
409
|
serial_test = "3.4.0"
|
|
410
410
|
tar = "0.4.45"
|
|
411
411
|
tempfile = "3.27.0"
|
|
412
|
-
tokio = { version = "1.51.
|
|
412
|
+
tokio = { version = "1.51.1", features = ["macros", "time"] }
|
|
413
413
|
tokio-test = "0.4"
|
|
414
414
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
415
415
|
zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.8.
|
|
21
|
+
> **🚀 Version 4.8.1 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -19,7 +19,8 @@ fn default_true() -> bool {
|
|
|
19
19
|
pub struct ContentFilterConfig {
|
|
20
20
|
/// Include running headers in extraction output.
|
|
21
21
|
///
|
|
22
|
-
/// - PDF: Disables top-margin furniture stripping
|
|
22
|
+
/// - PDF: Disables top-margin furniture stripping and prevents the layout
|
|
23
|
+
/// model from treating `PageHeader`-classified regions as furniture.
|
|
23
24
|
/// - DOCX: Includes document headers in text output.
|
|
24
25
|
/// - RTF/ODT: Headers already included; this is a no-op when true.
|
|
25
26
|
/// - HTML/EPUB: Keeps `<header>` element content.
|
|
@@ -30,7 +31,8 @@ pub struct ContentFilterConfig {
|
|
|
30
31
|
|
|
31
32
|
/// Include running footers in extraction output.
|
|
32
33
|
///
|
|
33
|
-
/// - PDF: Disables bottom-margin furniture stripping
|
|
34
|
+
/// - PDF: Disables bottom-margin furniture stripping and prevents the layout
|
|
35
|
+
/// model from treating `PageFooter`-classified regions as furniture.
|
|
34
36
|
/// - DOCX: Includes document footers in text output.
|
|
35
37
|
/// - RTF/ODT: Footers already included; this is a no-op when true.
|
|
36
38
|
/// - HTML/EPUB: Keeps `<footer>` element content.
|
|
@@ -39,11 +41,16 @@ pub struct ContentFilterConfig {
|
|
|
39
41
|
#[serde(default)]
|
|
40
42
|
pub include_footers: bool,
|
|
41
43
|
|
|
42
|
-
/// Enable cross-page repeating text
|
|
44
|
+
/// Enable the heuristic cross-page repeating text detector.
|
|
43
45
|
///
|
|
44
|
-
/// When `true` (default), text that repeats verbatim across a
|
|
45
|
-
///
|
|
46
|
-
/// or repeated headings are being incorrectly removed.
|
|
46
|
+
/// When `true` (default), text that repeats verbatim across a supermajority
|
|
47
|
+
/// of pages is classified as furniture and stripped. Disable this if brand
|
|
48
|
+
/// names or repeated headings are being incorrectly removed by the heuristic.
|
|
49
|
+
///
|
|
50
|
+
/// Note: when a layout-detection model is active, the model may independently
|
|
51
|
+
/// classify page-header / page-footer regions as furniture on a per-page basis.
|
|
52
|
+
/// To preserve those regions, set `include_headers = true` and/or
|
|
53
|
+
/// `include_footers = true` in addition to disabling this flag.
|
|
47
54
|
///
|
|
48
55
|
/// Primarily affects PDF extraction.
|
|
49
56
|
///
|
|
@@ -121,6 +121,18 @@ pub struct ExtractionConfig {
|
|
|
121
121
|
#[serde(default)]
|
|
122
122
|
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
|
|
123
123
|
|
|
124
|
+
/// Styled HTML output configuration.
|
|
125
|
+
///
|
|
126
|
+
/// When set alongside `output_format = OutputFormat::Html`, the extraction
|
|
127
|
+
/// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
|
|
128
|
+
/// which emits stable `kb-*` CSS class hooks on every structural element
|
|
129
|
+
/// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
|
|
130
|
+
///
|
|
131
|
+
/// When `None`, the existing plain comrak-based HTML renderer is used.
|
|
132
|
+
#[cfg(feature = "html")]
|
|
133
|
+
#[serde(default)]
|
|
134
|
+
pub html_output: Option<crate::core::config::html_output::HtmlOutputConfig>,
|
|
135
|
+
|
|
124
136
|
/// Default per-file timeout in seconds for batch extraction.
|
|
125
137
|
///
|
|
126
138
|
/// When set, each file in a batch will be canceled after this duration
|
|
@@ -272,6 +284,8 @@ impl Default for ExtractionConfig {
|
|
|
272
284
|
postprocessor: None,
|
|
273
285
|
#[cfg(feature = "html")]
|
|
274
286
|
html_options: None,
|
|
287
|
+
#[cfg(feature = "html")]
|
|
288
|
+
html_output: None,
|
|
275
289
|
extraction_timeout_secs: None,
|
|
276
290
|
max_concurrent_extractions: None,
|
|
277
291
|
#[cfg(feature = "archives")]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
//! HTML output configuration.
|
|
2
|
+
//!
|
|
3
|
+
//! Controls how `OutputFormat::Html` renders an `InternalDocument`:
|
|
4
|
+
//! which built-in theme to use, whether to embed the CSS in a `<style>`
|
|
5
|
+
//! block, and optional user-supplied CSS (inline string or file path).
|
|
6
|
+
|
|
7
|
+
use std::path::PathBuf;
|
|
8
|
+
|
|
9
|
+
use serde::{Deserialize, Serialize};
|
|
10
|
+
|
|
11
|
+
fn default_class_prefix() -> String {
|
|
12
|
+
"kb-".to_string()
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
fn default_true() -> bool {
|
|
16
|
+
true
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/// Configuration for styled HTML output.
|
|
20
|
+
///
|
|
21
|
+
/// When set on [`ExtractionConfig::html_output`] alongside
|
|
22
|
+
/// `output_format = OutputFormat::Html`, the pipeline builds a
|
|
23
|
+
/// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
|
|
24
|
+
/// the plain comrak-based renderer.
|
|
25
|
+
///
|
|
26
|
+
/// # Example
|
|
27
|
+
///
|
|
28
|
+
/// ```rust
|
|
29
|
+
/// use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
|
|
30
|
+
///
|
|
31
|
+
/// let config = HtmlOutputConfig {
|
|
32
|
+
/// theme: HtmlTheme::GitHub,
|
|
33
|
+
/// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
|
|
34
|
+
/// ..Default::default()
|
|
35
|
+
/// };
|
|
36
|
+
/// ```
|
|
37
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
38
|
+
pub struct HtmlOutputConfig {
|
|
39
|
+
/// Inline CSS string injected into the output after the theme stylesheet.
|
|
40
|
+
/// Concatenated after `css_file` content when both are set.
|
|
41
|
+
#[serde(default)]
|
|
42
|
+
pub css: Option<String>,
|
|
43
|
+
|
|
44
|
+
/// Path to a CSS file loaded once at renderer construction time.
|
|
45
|
+
/// Concatenated before `css` when both are set.
|
|
46
|
+
#[serde(default)]
|
|
47
|
+
pub css_file: Option<PathBuf>,
|
|
48
|
+
|
|
49
|
+
/// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
|
|
50
|
+
#[serde(default)]
|
|
51
|
+
pub theme: HtmlTheme,
|
|
52
|
+
|
|
53
|
+
/// CSS class prefix applied to every emitted class name.
|
|
54
|
+
///
|
|
55
|
+
/// Default: `"kb-"`. Change this if your host application already uses
|
|
56
|
+
/// classes that start with `kb-`.
|
|
57
|
+
#[serde(default = "default_class_prefix")]
|
|
58
|
+
pub class_prefix: String,
|
|
59
|
+
|
|
60
|
+
/// When `true` (default), write the resolved CSS into a `<style>` block
|
|
61
|
+
/// immediately after the opening `<div class="{prefix}doc">`.
|
|
62
|
+
///
|
|
63
|
+
/// Set to `false` to emit only the structural markup and wire up your
|
|
64
|
+
/// own stylesheet targeting the `kb-*` class names.
|
|
65
|
+
#[serde(default = "default_true")]
|
|
66
|
+
pub embed_css: bool,
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
impl Default for HtmlOutputConfig {
|
|
70
|
+
fn default() -> Self {
|
|
71
|
+
Self {
|
|
72
|
+
css: None,
|
|
73
|
+
css_file: None,
|
|
74
|
+
theme: HtmlTheme::Unstyled,
|
|
75
|
+
class_prefix: default_class_prefix(),
|
|
76
|
+
embed_css: true,
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Built-in HTML theme selection.
|
|
82
|
+
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
83
|
+
#[serde(rename_all = "lowercase")]
|
|
84
|
+
pub enum HtmlTheme {
|
|
85
|
+
/// Sensible defaults: system font stack, neutral colours, readable line
|
|
86
|
+
/// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
|
|
87
|
+
/// can override individual values.
|
|
88
|
+
Default,
|
|
89
|
+
/// GitHub Markdown-inspired palette and spacing.
|
|
90
|
+
GitHub,
|
|
91
|
+
/// Dark background, light text.
|
|
92
|
+
Dark,
|
|
93
|
+
/// Minimal light theme with generous whitespace.
|
|
94
|
+
Light,
|
|
95
|
+
/// No built-in stylesheet emitted. CSS custom properties are still defined
|
|
96
|
+
/// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
|
|
97
|
+
#[default]
|
|
98
|
+
Unstyled,
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
#[cfg(test)]
|
|
102
|
+
mod tests {
|
|
103
|
+
use super::*;
|
|
104
|
+
|
|
105
|
+
#[test]
|
|
106
|
+
fn default_config_values() {
|
|
107
|
+
let cfg = HtmlOutputConfig::default();
|
|
108
|
+
assert_eq!(cfg.class_prefix, "kb-");
|
|
109
|
+
assert!(cfg.embed_css);
|
|
110
|
+
assert!(cfg.css.is_none());
|
|
111
|
+
assert!(cfg.css_file.is_none());
|
|
112
|
+
assert_eq!(cfg.theme, HtmlTheme::Unstyled);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
#[test]
|
|
116
|
+
fn serde_roundtrip() {
|
|
117
|
+
let cfg = HtmlOutputConfig {
|
|
118
|
+
css: Some(".kb-p { color: red; }".to_string()),
|
|
119
|
+
theme: HtmlTheme::GitHub,
|
|
120
|
+
embed_css: false,
|
|
121
|
+
..Default::default()
|
|
122
|
+
};
|
|
123
|
+
let json = serde_json::to_string(&cfg).unwrap();
|
|
124
|
+
let back: HtmlOutputConfig = serde_json::from_str(&json).unwrap();
|
|
125
|
+
assert_eq!(back.css, cfg.css);
|
|
126
|
+
assert_eq!(back.theme, HtmlTheme::GitHub);
|
|
127
|
+
assert!(!back.embed_css);
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
#[test]
|
|
131
|
+
fn theme_serde() {
|
|
132
|
+
assert_eq!(serde_json::to_string(&HtmlTheme::GitHub).unwrap(), "\"github\"");
|
|
133
|
+
let t: HtmlTheme = serde_json::from_str("\"dark\"").unwrap();
|
|
134
|
+
assert_eq!(t, HtmlTheme::Dark);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
@@ -9,6 +9,8 @@ pub mod content_filter;
|
|
|
9
9
|
pub mod email;
|
|
10
10
|
pub mod extraction;
|
|
11
11
|
pub mod formats;
|
|
12
|
+
#[cfg(feature = "html")]
|
|
13
|
+
pub mod html_output;
|
|
12
14
|
pub mod layout;
|
|
13
15
|
pub mod llm;
|
|
14
16
|
pub mod merge;
|
|
@@ -28,6 +30,8 @@ pub use extraction::{
|
|
|
28
30
|
ExtractionConfig, FileExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig,
|
|
29
31
|
};
|
|
30
32
|
pub use formats::OutputFormat;
|
|
33
|
+
#[cfg(feature = "html")]
|
|
34
|
+
pub use html_output::{HtmlOutputConfig, HtmlTheme};
|
|
31
35
|
#[cfg(feature = "layout-detection")]
|
|
32
36
|
pub use layout::{LayoutDetectionConfig, TableModel};
|
|
33
37
|
pub use llm::{LlmConfig, StructuredExtractionConfig};
|
|
@@ -40,7 +40,7 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
40
40
|
// Try exact match first
|
|
41
41
|
if let Some(pos) = content[search_offset..].find(&page.content) {
|
|
42
42
|
let byte_start = search_offset + pos;
|
|
43
|
-
let byte_end = byte_start + page.content.len();
|
|
43
|
+
let byte_end = content.floor_char_boundary(byte_start + page.content.len());
|
|
44
44
|
boundaries.push(PageBoundary {
|
|
45
45
|
page_number: page.page_number,
|
|
46
46
|
byte_start,
|
|
@@ -55,7 +55,8 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
|
|
|
55
55
|
&& let Some(pos) = content[search_offset..].find(line)
|
|
56
56
|
{
|
|
57
57
|
let byte_start = search_offset + pos;
|
|
58
|
-
let
|
|
58
|
+
let raw_end = (byte_start + page.content.len()).min(content.len());
|
|
59
|
+
let byte_end = content.floor_char_boundary(raw_end);
|
|
59
60
|
boundaries.push(PageBoundary {
|
|
60
61
|
page_number: page.page_number,
|
|
61
62
|
byte_start,
|