kreuzberg 4.8.0 → 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +17 -71
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +6 -6
  7. data/vendor/kreuzberg/Cargo.toml +8 -8
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/content_filter.rs +13 -6
  10. data/vendor/kreuzberg/src/core/config/extraction/core.rs +14 -0
  11. data/vendor/kreuzberg/src/core/config/html_output.rs +136 -0
  12. data/vendor/kreuzberg/src/core/config/mod.rs +4 -0
  13. data/vendor/kreuzberg/src/core/pipeline/features.rs +3 -2
  14. data/vendor/kreuzberg/src/core/pipeline/mod.rs +64 -0
  15. data/vendor/kreuzberg/src/core/pipeline/tests.rs +1 -1
  16. data/vendor/kreuzberg/src/extraction/docx/parser.rs +97 -26
  17. data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
  18. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +5 -0
  19. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +290 -47
  20. data/vendor/kreuzberg/src/extractors/docbook.rs +44 -8
  21. data/vendor/kreuzberg/src/extractors/docx.rs +16 -5
  22. data/vendor/kreuzberg/src/extractors/html.rs +61 -14
  23. data/vendor/kreuzberg/src/extractors/latex/mod.rs +64 -19
  24. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +20 -5
  25. data/vendor/kreuzberg/src/extractors/pptx.rs +36 -23
  26. data/vendor/kreuzberg/src/extractors/rst.rs +33 -8
  27. data/vendor/kreuzberg/src/keywords/yake/context.rs +55 -0
  28. data/vendor/kreuzberg/src/keywords/yake/counter.rs +42 -0
  29. data/vendor/kreuzberg/src/keywords/yake/mod.rs +550 -0
  30. data/vendor/kreuzberg/src/keywords/yake/plural_helper.rs +40 -0
  31. data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs +376 -0
  32. data/vendor/kreuzberg/src/keywords/yake/result_item.rs +113 -0
  33. data/vendor/kreuzberg/src/keywords/yake/stats.rs +106 -0
  34. data/vendor/kreuzberg/src/keywords/yake/tag.rs +174 -0
  35. data/vendor/kreuzberg/src/lib.rs +5 -0
  36. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +133 -0
  37. data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +18 -0
  38. data/vendor/kreuzberg/src/rendering/html_styled.rs +711 -0
  39. data/vendor/kreuzberg/src/rendering/mod.rs +4 -0
  40. data/vendor/kreuzberg/tests/api_consistency.rs +2 -0
  41. data/vendor/kreuzberg/tests/issue_670_content_filter_layout_regression.rs +152 -0
  42. data/vendor/kreuzberg/tests/issue_671_pptx_image_config_regression.rs +161 -0
  43. data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
  44. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  45. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  46. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  47. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  48. metadata +14 -3
  49. data/vendor/kreuzberg/src/keywords/yake.rs +0 -163
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c768f2adf392c1598da39e79b20dcc7f0c55774f2d3063c74c6ec72888742bac
4
- data.tar.gz: 8a350998762668be79e7f4a3843812782f4b0812fdb41327d8d38f1f57c39073
3
+ metadata.gz: 051fc6ca84b8545cb7049bd336a4897888b98f125fe40bfa24c6ab2e15232114
4
+ data.tar.gz: f64b385b1612258e73a6f771d107375ca1f892af7af199cbbaa8b19f184058df
5
5
  SHA512:
6
- metadata.gz: 18b2dc0c39199bead5f1f310a38b5f288cba0d8ebb62d31505f13933a0af3bf9179f8377e7a8bd605c39d46e900869a4492f1bd829f853d6f1765263f4affd4d
7
- data.tar.gz: 769b9c36d5f6722f97f3c3aebb97f1045382a6dbca96405e298c44bc268795d97781edb89d140c1bc2f0e1fb33c5efecb79c2e500469dd7a25fe2105c49081ba
6
+ metadata.gz: 949b5cdda46928b7981d4178efae64d64568155431785520774343fd27b47d2ebc7432d859fd404ff7f405cc93be7dbfda704e618b2aa079ff7d9522f7dac4c4
7
+ data.tar.gz: c6ea82f649544a0db19accb6b4eca03af5e9619321a973fe5cac9ad1524014bb766dc7edcec302bb2bef64c7b7ab0deb0ba1760fe80ccf2b5547c1bc11d4ad29
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -2167,18 +2167,6 @@ dependencies = [
2167
2167
  "markup5ever",
2168
2168
  ]
2169
2169
 
2170
- [[package]]
2171
- name = "htmlize"
2172
- version = "1.0.6"
2173
- source = "registry+https://github.com/rust-lang/crates.io-index"
2174
- checksum = "d347c0de239be20ba0982e4822de3124404281e119ae3e11f5d7425a414e1935"
2175
- dependencies = [
2176
- "matchgen",
2177
- "memchr",
2178
- "pastey 0.1.1",
2179
- "serde_json",
2180
- ]
2181
-
2182
2170
  [[package]]
2183
2171
  name = "http"
2184
2172
  version = "1.4.0"
@@ -2792,7 +2780,7 @@ dependencies = [
2792
2780
 
2793
2781
  [[package]]
2794
2782
  name = "kreuzberg"
2795
- version = "4.7.4"
2783
+ version = "4.8.1"
2796
2784
  dependencies = [
2797
2785
  "ahash",
2798
2786
  "async-trait",
@@ -2880,14 +2868,14 @@ dependencies = [
2880
2868
  "ureq 3.3.0",
2881
2869
  "urlencoding",
2882
2870
  "utoipa",
2871
+ "v_htmlescape",
2883
2872
  "whatlang",
2884
- "yake-rust",
2885
2873
  "zip 8.5.1",
2886
2874
  ]
2887
2875
 
2888
2876
  [[package]]
2889
2877
  name = "kreuzberg-ffi"
2890
- version = "4.7.4"
2878
+ version = "4.8.1"
2891
2879
  dependencies = [
2892
2880
  "ahash",
2893
2881
  "async-trait",
@@ -2903,7 +2891,7 @@ dependencies = [
2903
2891
 
2904
2892
  [[package]]
2905
2893
  name = "kreuzberg-paddle-ocr"
2906
- version = "4.7.4"
2894
+ version = "4.8.1"
2907
2895
  dependencies = [
2908
2896
  "geo-clipper",
2909
2897
  "geo-types",
@@ -2917,7 +2905,7 @@ dependencies = [
2917
2905
 
2918
2906
  [[package]]
2919
2907
  name = "kreuzberg-pdfium-render"
2920
- version = "4.7.4"
2908
+ version = "4.8.1"
2921
2909
  dependencies = [
2922
2910
  "bitflags",
2923
2911
  "bytemuck",
@@ -2940,7 +2928,7 @@ dependencies = [
2940
2928
 
2941
2929
  [[package]]
2942
2930
  name = "kreuzberg-rb"
2943
- version = "4.7.4"
2931
+ version = "4.8.1"
2944
2932
  dependencies = [
2945
2933
  "async-trait",
2946
2934
  "html-to-markdown-rs",
@@ -2957,7 +2945,7 @@ dependencies = [
2957
2945
 
2958
2946
  [[package]]
2959
2947
  name = "kreuzberg-tesseract"
2960
- version = "4.7.4"
2948
+ version = "4.8.1"
2961
2949
  dependencies = [
2962
2950
  "cc",
2963
2951
  "cmake",
@@ -2984,12 +2972,6 @@ version = "0.5.3"
2984
2972
  source = "registry+https://github.com/rust-lang/crates.io-index"
2985
2973
  checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
2986
2974
 
2987
- [[package]]
2988
- name = "levenshtein"
2989
- version = "1.0.5"
2990
- source = "registry+https://github.com/rust-lang/crates.io-index"
2991
- checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
2992
-
2993
2975
  [[package]]
2994
2976
  name = "libbz2-rs-sys"
2995
2977
  version = "0.2.2"
@@ -3229,12 +3211,6 @@ dependencies = [
3229
3211
  "web_atoms",
3230
3212
  ]
3231
3213
 
3232
- [[package]]
3233
- name = "matchgen"
3234
- version = "0.3.1"
3235
- source = "registry+https://github.com/rust-lang/crates.io-index"
3236
- checksum = "e3c6ed90dceac899d670024e99486140739a14a1bda2bd05604689b8979a2894"
3237
-
3238
3214
  [[package]]
3239
3215
  name = "matchit"
3240
3216
  version = "0.8.4"
@@ -4754,18 +4730,6 @@ dependencies = [
4754
4730
  "libc",
4755
4731
  ]
4756
4732
 
4757
- [[package]]
4758
- name = "segtok"
4759
- version = "0.1.5"
4760
- source = "registry+https://github.com/rust-lang/crates.io-index"
4761
- checksum = "80ee4d89b4a3f29cd09fc8dd79c26531298035276cdfd0673ec7e543fff32e51"
4762
- dependencies = [
4763
- "either",
4764
- "fancy-regex",
4765
- "htmlize",
4766
- "itertools 0.14.0",
4767
- ]
4768
-
4769
4733
  [[package]]
4770
4734
  name = "semver"
4771
4735
  version = "1.0.28"
@@ -5097,15 +5061,6 @@ version = "0.1.9"
5097
5061
  source = "registry+https://github.com/rust-lang/crates.io-index"
5098
5062
  checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520"
5099
5063
 
5100
- [[package]]
5101
- name = "streaming-stats"
5102
- version = "0.2.3"
5103
- source = "registry+https://github.com/rust-lang/crates.io-index"
5104
- checksum = "b0d670ce4e348a2081843569e0f79b21c99c91bb9028b3b3ecb0f050306de547"
5105
- dependencies = [
5106
- "num-traits",
5107
- ]
5108
-
5109
5064
  [[package]]
5110
5065
  name = "string_cache"
5111
5066
  version = "0.9.0"
@@ -5430,9 +5385,9 @@ dependencies = [
5430
5385
 
5431
5386
  [[package]]
5432
5387
  name = "tokio"
5433
- version = "1.51.0"
5388
+ version = "1.51.1"
5434
5389
  source = "registry+https://github.com/rust-lang/crates.io-index"
5435
- checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd"
5390
+ checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c"
5436
5391
  dependencies = [
5437
5392
  "bytes",
5438
5393
  "libc",
@@ -5710,9 +5665,9 @@ checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
5710
5665
 
5711
5666
  [[package]]
5712
5667
  name = "tree-sitter-language-pack"
5713
- version = "1.4.1"
5668
+ version = "1.4.2"
5714
5669
  source = "registry+https://github.com/rust-lang/crates.io-index"
5715
- checksum = "fe94021e675329d320ded167b4dfe0b0faa8b321ecb0d8aa89fe035e0768e256"
5670
+ checksum = "6ed9cacce88ea8b3a92813649012c1ef387bc864f1dd8398843e8b9e076233b0"
5716
5671
  dependencies = [
5717
5672
  "ahash",
5718
5673
  "cc",
@@ -6007,6 +5962,12 @@ dependencies = [
6007
5962
  "wasm-bindgen",
6008
5963
  ]
6009
5964
 
5965
+ [[package]]
5966
+ name = "v_htmlescape"
5967
+ version = "0.15.8"
5968
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5969
+ checksum = "4e8257fbc510f0a46eb602c10215901938b5c2a7d5e70fc11483b1d3c9b5b18c"
5970
+
6010
5971
  [[package]]
6011
5972
  name = "valuable"
6012
5973
  version = "0.1.1"
@@ -6715,21 +6676,6 @@ version = "0.8.0"
6715
6676
  source = "registry+https://github.com/rust-lang/crates.io-index"
6716
6677
  checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448"
6717
6678
 
6718
- [[package]]
6719
- name = "yake-rust"
6720
- version = "1.0.3"
6721
- source = "registry+https://github.com/rust-lang/crates.io-index"
6722
- checksum = "9ce4fc48712dfe1427747a6b1f2bb1ca7b9abb7168fbc086774f890b6e4e097f"
6723
- dependencies = [
6724
- "hashbrown 0.15.5",
6725
- "indexmap",
6726
- "itertools 0.14.0",
6727
- "levenshtein",
6728
- "regex",
6729
- "segtok",
6730
- "streaming-stats",
6731
- ]
6732
-
6733
6679
  [[package]]
6734
6680
  name = "yansi"
6735
6681
  version = "1.0.1"
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.0"
3
+ version = "4.8.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -55,7 +55,7 @@ rb-sys = { version = "0.9.126", default-features = false, features = [
55
55
  serde_json = "1.0.149"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
- tokio = { version = "1.51.0", features = [
58
+ tokio = { version = "1.51.1", features = [
59
59
  "rt",
60
60
  "rt-multi-thread",
61
61
  "macros",
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.0'
4
+ VERSION = '4.8.1'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.8.0"
5
+ version = "4.8.1"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.8.0", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.0" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.8.1", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.1" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.21", default-features = false }
48
+ pdf_oxide = { version = "0.3.22", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.11.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -54,10 +54,10 @@ serde_json = { version = "1.0.149" }
54
54
  serde_toon_format = "0.1"
55
55
  tempfile = "3.27.0"
56
56
  thiserror = "2.0.18"
57
- tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
57
+ tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
58
  toml = "1.1.2"
59
59
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.4.1", features = ["serde"], default-features = false }
60
+ tree-sitter-language-pack = { version = "1.4.2", features = ["serde"], default-features = false }
61
61
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
62
  wasm-bindgen-futures = "0.4"
63
63
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.8.0"
3
+ version = "4.8.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -62,7 +62,7 @@ office = [
62
62
  hwp = ["dep:cfb", "dep:flate2"]
63
63
  iwork = ["dep:zip", "dep:snap"]
64
64
  email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
65
- html = ["dep:html-to-markdown-rs"]
65
+ html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
66
66
  xml = ["dep:quick-xml", "dep:roxmltree"]
67
67
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
68
68
  mdx = []
@@ -144,7 +144,7 @@ embeddings = [
144
144
  stopwords = []
145
145
  quality = ["dep:unicode-normalization", "dep:chardetng", "stopwords"]
146
146
 
147
- keywords-yake = ["dep:yake-rust", "stopwords"]
147
+ keywords-yake = ["stopwords"]
148
148
  keywords-rake = ["dep:rake", "stopwords"]
149
149
  keywords = ["keywords-yake", "keywords-rake"]
150
150
 
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.21", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.22", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -349,7 +349,7 @@ tokenizers = { version = "0.22", optional = true, default-features = false, feat
349
349
  "http",
350
350
  "fancy-regex",
351
351
  ] }
352
- tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
352
+ tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
353
353
  toml = "1.1.2"
354
354
  tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
355
355
  tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
@@ -358,8 +358,8 @@ tracing-opentelemetry = { version = "0.32", optional = true }
358
358
  unicode-normalization = { version = "0.1.25", optional = true }
359
359
  urlencoding = "2"
360
360
  utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
361
+ v_htmlescape = { version = "0.15", optional = true }
361
362
  whatlang = { version = "0.18.0", optional = true }
362
- yake-rust = { version = "1.0.3", optional = true }
363
363
  zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
364
364
  "deflate-flate2",
365
365
  ] }
@@ -392,7 +392,7 @@ optional = true
392
392
  # Override getrandom to enable js feature for WASM targets
393
393
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
394
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.4.1", features = ["wasm", "serde"], default-features = false, optional = true }
395
+ tree-sitter-language-pack = { version = "1.4.2", features = ["wasm", "serde"], default-features = false, optional = true }
396
396
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
397
 
398
398
  [build-dependencies]
@@ -409,7 +409,7 @@ jsonschema = "0.45"
409
409
  serial_test = "3.4.0"
410
410
  tar = "0.4.45"
411
411
  tempfile = "3.27.0"
412
- tokio = { version = "1.51.0", features = ["macros", "time"] }
412
+ tokio = { version = "1.51.1", features = ["macros", "time"] }
413
413
  tokio-test = "0.4"
414
414
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
415
415
  zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.8.0 Release**
21
+ > **🚀 Version 4.8.1 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/core/config/content_filter.rs CHANGED
@@ -19,7 +19,8 @@ fn default_true() -> bool {
19
19
  pub struct ContentFilterConfig {
20
20
  /// Include running headers in extraction output.
21
21
  ///
22
- /// - PDF: Disables top-margin furniture stripping.
22
+ /// - PDF: Disables top-margin furniture stripping and prevents the layout
23
+ /// model from treating `PageHeader`-classified regions as furniture.
23
24
  /// - DOCX: Includes document headers in text output.
24
25
  /// - RTF/ODT: Headers already included; this is a no-op when true.
25
26
  /// - HTML/EPUB: Keeps `<header>` element content.
@@ -30,7 +31,8 @@ pub struct ContentFilterConfig {
30
31
 
31
32
  /// Include running footers in extraction output.
32
33
  ///
33
- /// - PDF: Disables bottom-margin furniture stripping.
34
+ /// - PDF: Disables bottom-margin furniture stripping and prevents the layout
35
+ /// model from treating `PageFooter`-classified regions as furniture.
34
36
  /// - DOCX: Includes document footers in text output.
35
37
  /// - RTF/ODT: Footers already included; this is a no-op when true.
36
38
  /// - HTML/EPUB: Keeps `<footer>` element content.
@@ -39,11 +41,16 @@ pub struct ContentFilterConfig {
39
41
  #[serde(default)]
40
42
  pub include_footers: bool,
41
43
 
42
- /// Enable cross-page repeating text detection and removal.
44
+ /// Enable the heuristic cross-page repeating text detector.
43
45
  ///
44
- /// When `true` (default), text that repeats verbatim across a
45
- /// supermajority of pages is stripped. Disable this if brand names
46
- /// or repeated headings are being incorrectly removed.
46
+ /// When `true` (default), text that repeats verbatim across a supermajority
47
+ /// of pages is classified as furniture and stripped. Disable this if brand
48
+ /// names or repeated headings are being incorrectly removed by the heuristic.
49
+ ///
50
+ /// Note: when a layout-detection model is active, the model may independently
51
+ /// classify page-header / page-footer regions as furniture on a per-page basis.
52
+ /// To preserve those regions, set `include_headers = true` and/or
53
+ /// `include_footers = true` in addition to disabling this flag.
47
54
  ///
48
55
  /// Primarily affects PDF extraction.
49
56
  ///
data/vendor/kreuzberg/src/core/config/extraction/core.rs CHANGED
@@ -121,6 +121,18 @@ pub struct ExtractionConfig {
121
121
  #[serde(default)]
122
122
  pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
123
123
 
124
+ /// Styled HTML output configuration.
125
+ ///
126
+ /// When set alongside `output_format = OutputFormat::Html`, the extraction
127
+ /// pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
128
+ /// which emits stable `kb-*` CSS class hooks on every structural element
129
+ /// and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
130
+ ///
131
+ /// When `None`, the existing plain comrak-based HTML renderer is used.
132
+ #[cfg(feature = "html")]
133
+ #[serde(default)]
134
+ pub html_output: Option<crate::core::config::html_output::HtmlOutputConfig>,
135
+
124
136
  /// Default per-file timeout in seconds for batch extraction.
125
137
  ///
126
138
  /// When set, each file in a batch will be canceled after this duration
@@ -272,6 +284,8 @@ impl Default for ExtractionConfig {
272
284
  postprocessor: None,
273
285
  #[cfg(feature = "html")]
274
286
  html_options: None,
287
+ #[cfg(feature = "html")]
288
+ html_output: None,
275
289
  extraction_timeout_secs: None,
276
290
  max_concurrent_extractions: None,
277
291
  #[cfg(feature = "archives")]
data/vendor/kreuzberg/src/core/config/html_output.rs ADDED
@@ -0,0 +1,136 @@
1
+ //! HTML output configuration.
2
+ //!
3
+ //! Controls how `OutputFormat::Html` renders an `InternalDocument`:
4
+ //! which built-in theme to use, whether to embed the CSS in a `<style>`
5
+ //! block, and optional user-supplied CSS (inline string or file path).
6
+
7
+ use std::path::PathBuf;
8
+
9
+ use serde::{Deserialize, Serialize};
10
+
11
+ fn default_class_prefix() -> String {
12
+ "kb-".to_string()
13
+ }
14
+
15
+ fn default_true() -> bool {
16
+ true
17
+ }
18
+
19
+ /// Configuration for styled HTML output.
20
+ ///
21
+ /// When set on [`ExtractionConfig::html_output`] alongside
22
+ /// `output_format = OutputFormat::Html`, the pipeline builds a
23
+ /// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
24
+ /// the plain comrak-based renderer.
25
+ ///
26
+ /// # Example
27
+ ///
28
+ /// ```rust
29
+ /// use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
30
+ ///
31
+ /// let config = HtmlOutputConfig {
32
+ /// theme: HtmlTheme::GitHub,
33
+ /// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
34
+ /// ..Default::default()
35
+ /// };
36
+ /// ```
37
+ #[derive(Debug, Clone, Serialize, Deserialize)]
38
+ pub struct HtmlOutputConfig {
39
+ /// Inline CSS string injected into the output after the theme stylesheet.
40
+ /// Concatenated after `css_file` content when both are set.
41
+ #[serde(default)]
42
+ pub css: Option<String>,
43
+
44
+ /// Path to a CSS file loaded once at renderer construction time.
45
+ /// Concatenated before `css` when both are set.
46
+ #[serde(default)]
47
+ pub css_file: Option<PathBuf>,
48
+
49
+ /// Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].
50
+ #[serde(default)]
51
+ pub theme: HtmlTheme,
52
+
53
+ /// CSS class prefix applied to every emitted class name.
54
+ ///
55
+ /// Default: `"kb-"`. Change this if your host application already uses
56
+ /// classes that start with `kb-`.
57
+ #[serde(default = "default_class_prefix")]
58
+ pub class_prefix: String,
59
+
60
+ /// When `true` (default), write the resolved CSS into a `<style>` block
61
+ /// immediately after the opening `<div class="{prefix}doc">`.
62
+ ///
63
+ /// Set to `false` to emit only the structural markup and wire up your
64
+ /// own stylesheet targeting the `kb-*` class names.
65
+ #[serde(default = "default_true")]
66
+ pub embed_css: bool,
67
+ }
68
+
69
+ impl Default for HtmlOutputConfig {
70
+ fn default() -> Self {
71
+ Self {
72
+ css: None,
73
+ css_file: None,
74
+ theme: HtmlTheme::Unstyled,
75
+ class_prefix: default_class_prefix(),
76
+ embed_css: true,
77
+ }
78
+ }
79
+ }
80
+
81
+ /// Built-in HTML theme selection.
82
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
83
+ #[serde(rename_all = "lowercase")]
84
+ pub enum HtmlTheme {
85
+ /// Sensible defaults: system font stack, neutral colours, readable line
86
+ /// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
87
+ /// can override individual values.
88
+ Default,
89
+ /// GitHub Markdown-inspired palette and spacing.
90
+ GitHub,
91
+ /// Dark background, light text.
92
+ Dark,
93
+ /// Minimal light theme with generous whitespace.
94
+ Light,
95
+ /// No built-in stylesheet emitted. CSS custom properties are still defined
96
+ /// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
97
+ #[default]
98
+ Unstyled,
99
+ }
100
+
101
+ #[cfg(test)]
102
+ mod tests {
103
+ use super::*;
104
+
105
+ #[test]
106
+ fn default_config_values() {
107
+ let cfg = HtmlOutputConfig::default();
108
+ assert_eq!(cfg.class_prefix, "kb-");
109
+ assert!(cfg.embed_css);
110
+ assert!(cfg.css.is_none());
111
+ assert!(cfg.css_file.is_none());
112
+ assert_eq!(cfg.theme, HtmlTheme::Unstyled);
113
+ }
114
+
115
+ #[test]
116
+ fn serde_roundtrip() {
117
+ let cfg = HtmlOutputConfig {
118
+ css: Some(".kb-p { color: red; }".to_string()),
119
+ theme: HtmlTheme::GitHub,
120
+ embed_css: false,
121
+ ..Default::default()
122
+ };
123
+ let json = serde_json::to_string(&cfg).unwrap();
124
+ let back: HtmlOutputConfig = serde_json::from_str(&json).unwrap();
125
+ assert_eq!(back.css, cfg.css);
126
+ assert_eq!(back.theme, HtmlTheme::GitHub);
127
+ assert!(!back.embed_css);
128
+ }
129
+
130
+ #[test]
131
+ fn theme_serde() {
132
+ assert_eq!(serde_json::to_string(&HtmlTheme::GitHub).unwrap(), "\"github\"");
133
+ let t: HtmlTheme = serde_json::from_str("\"dark\"").unwrap();
134
+ assert_eq!(t, HtmlTheme::Dark);
135
+ }
136
+ }
data/vendor/kreuzberg/src/core/config/mod.rs CHANGED
@@ -9,6 +9,8 @@ pub mod content_filter;
9
9
  pub mod email;
10
10
  pub mod extraction;
11
11
  pub mod formats;
12
+ #[cfg(feature = "html")]
13
+ pub mod html_output;
12
14
  pub mod layout;
13
15
  pub mod llm;
14
16
  pub mod merge;
@@ -28,6 +30,8 @@ pub use extraction::{
28
30
  ExtractionConfig, FileExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig,
29
31
  };
30
32
  pub use formats::OutputFormat;
33
+ #[cfg(feature = "html")]
34
+ pub use html_output::{HtmlOutputConfig, HtmlTheme};
31
35
  #[cfg(feature = "layout-detection")]
32
36
  pub use layout::{LayoutDetectionConfig, TableModel};
33
37
  pub use llm::{LlmConfig, StructuredExtractionConfig};
data/vendor/kreuzberg/src/core/pipeline/features.rs CHANGED
@@ -40,7 +40,7 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
40
40
  // Try exact match first
41
41
  if let Some(pos) = content[search_offset..].find(&page.content) {
42
42
  let byte_start = search_offset + pos;
43
- let byte_end = byte_start + page.content.len();
43
+ let byte_end = content.floor_char_boundary(byte_start + page.content.len());
44
44
  boundaries.push(PageBoundary {
45
45
  page_number: page.page_number,
46
46
  byte_start,
@@ -55,7 +55,8 @@ fn recompute_boundaries_from_pages(content: &str, pages: &[crate::types::PageCon
55
55
  && let Some(pos) = content[search_offset..].find(line)
56
56
  {
57
57
  let byte_start = search_offset + pos;
58
- let byte_end = (byte_start + page.content.len()).min(content.len());
58
+ let raw_end = (byte_start + page.content.len()).min(content.len());
59
+ let byte_end = content.floor_char_boundary(raw_end);
59
60
  boundaries.push(PageBoundary {
60
61
  page_number: page.page_number,
61
62
  byte_start,