kreuzberg 4.5.1 → 4.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +4 -4
- data/ext/kreuzberg_rb/native/Cargo.lock +90 -47
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -9
- data/lib/kreuzberg/config.rb +25 -8
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +7 -2
- data/vendor/Cargo.toml +6 -5
- data/vendor/kreuzberg/Cargo.toml +144 -111
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +483 -2
- data/vendor/kreuzberg/src/api/mod.rs +7 -2
- data/vendor/kreuzberg/src/api/openapi.rs +19 -0
- data/vendor/kreuzberg/src/api/router.rs +7 -3
- data/vendor/kreuzberg/src/api/types.rs +75 -0
- data/vendor/kreuzberg/src/cache/core.rs +223 -122
- data/vendor/kreuzberg/src/cache/mod.rs +20 -16
- data/vendor/kreuzberg/src/cache/utilities.rs +62 -44
- data/vendor/kreuzberg/src/chunking/core.rs +47 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +18 -0
- data/vendor/kreuzberg/src/core/config/layout.rs +12 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +19 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +79 -0
- data/vendor/kreuzberg/src/core/mime.rs +42 -1
- data/vendor/kreuzberg/src/extraction/hwp/error.rs +54 -0
- data/vendor/kreuzberg/src/extraction/hwp/mod.rs +72 -0
- data/vendor/kreuzberg/src/extraction/hwp/model.rs +102 -0
- data/vendor/kreuzberg/src/extraction/hwp/parser.rs +174 -0
- data/vendor/kreuzberg/src/extraction/hwp/reader.rs +126 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +3 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +58 -7
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +19 -5
- data/vendor/kreuzberg/src/extractors/hwp.rs +4 -5
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +189 -0
- data/vendor/kreuzberg/src/extractors/iwork/mod.rs +291 -0
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +186 -0
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +182 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +13 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +18 -3
- data/vendor/kreuzberg/src/layout/engine.rs +3 -0
- data/vendor/kreuzberg/src/layout/mod.rs +133 -0
- data/vendor/kreuzberg/src/layout/model_manager.rs +61 -2
- data/vendor/kreuzberg/src/layout/models/mod.rs +2 -0
- data/vendor/kreuzberg/src/layout/models/slanet.rs +550 -0
- data/vendor/kreuzberg/src/layout/models/table_classifier.rs +219 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -1
- data/vendor/kreuzberg/src/mcp/params.rs +87 -0
- data/vendor/kreuzberg/src/mcp/server.rs +585 -5
- data/vendor/kreuzberg/src/ocr/cache.rs +1 -1
- data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +21 -23
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +6 -25
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +29 -9
- data/vendor/kreuzberg/src/ocr/tessdata_manager.rs +254 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +6 -10
- data/vendor/kreuzberg/src/pdf/images.rs +13 -0
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +11 -0
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +9 -1
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +98 -6
- data/vendor/kreuzberg/src/pdf/markdown/mod.rs +1 -1
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +273 -51
- data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/table_recognition.rs +334 -1
- data/vendor/kreuzberg/src/pdf/markdown/regions/tables.rs +11 -1
- data/vendor/kreuzberg/src/pdf/markdown/render.rs +22 -16
- data/vendor/kreuzberg/src/pdf/markdown/text_repair.rs +209 -47
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +10 -1
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/pdf/text_data.rs +15 -6
- data/vendor/kreuzberg/tests/epub_markdown_headings_tests.rs +177 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +2 -2
- data/vendor/kreuzberg/tests/iwork_integration.rs +220 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +14 -14
- data/vendor/kreuzberg-ffi/kreuzberg.h +46 -2
- data/vendor/kreuzberg-ffi/src/config_builder.rs +81 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +14 -14
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +17 -17
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text/segment.rs +13 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text.rs +148 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +27 -27
- data/vendor/kreuzberg-tesseract/build.rs +61 -0
- metadata +16 -6
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +0 -179
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +0 -431
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +0 -150
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +0 -11
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47a14cc623891453596552fd893b43f18ffa04068de61b47b34b7f18ad8af890
|
|
4
|
+
data.tar.gz: 5e18a52f5acbabba2ee64790b3831c89301e58043c8b22ba7616791a2338401e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9ac2251c79bcdff41d8746ea3244bf3774cd022aa0f9c63a0153a7ec394f86d1bfd4fe3b364d175371f602006aafea17f612250b2b8368e213989b910f17940e
|
|
7
|
+
data.tar.gz: 8494619ec2253eaeb68b95fc4e204355ab2a08f71f5863f88b8305c7c07333a456f73c497804de7ca7693b1207bd0afaef3e6ca0ab5cb811eee1686823bff31d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.5.
|
|
4
|
+
kreuzberg (4.5.4)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.5.
|
|
225
|
+
kreuzberg (4.5.4)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.4" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -61,7 +61,7 @@
|
|
|
61
61
|
</div>
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
Extract text, tables, images, and metadata from
|
|
64
|
+
Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
## Installation
|
|
@@ -211,9 +211,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
211
211
|
|
|
212
212
|
## Features
|
|
213
213
|
|
|
214
|
-
### Supported File Formats (
|
|
214
|
+
### Supported File Formats (91+)
|
|
215
215
|
|
|
216
|
-
|
|
216
|
+
91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
217
217
|
|
|
218
218
|
#### Office Documents
|
|
219
219
|
|
|
@@ -161,6 +161,12 @@ dependencies = [
|
|
|
161
161
|
"syn",
|
|
162
162
|
]
|
|
163
163
|
|
|
164
|
+
[[package]]
|
|
165
|
+
name = "arrayref"
|
|
166
|
+
version = "0.3.9"
|
|
167
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
168
|
+
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
|
|
169
|
+
|
|
164
170
|
[[package]]
|
|
165
171
|
name = "arrayvec"
|
|
166
172
|
version = "0.7.6"
|
|
@@ -473,6 +479,20 @@ dependencies = [
|
|
|
473
479
|
"wyz",
|
|
474
480
|
]
|
|
475
481
|
|
|
482
|
+
[[package]]
|
|
483
|
+
name = "blake3"
|
|
484
|
+
version = "1.8.3"
|
|
485
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
486
|
+
checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
|
|
487
|
+
dependencies = [
|
|
488
|
+
"arrayref",
|
|
489
|
+
"arrayvec",
|
|
490
|
+
"cc",
|
|
491
|
+
"cfg-if",
|
|
492
|
+
"constant_time_eq 0.4.2",
|
|
493
|
+
"cpufeatures 0.2.17",
|
|
494
|
+
]
|
|
495
|
+
|
|
476
496
|
[[package]]
|
|
477
497
|
name = "block-buffer"
|
|
478
498
|
version = "0.10.4"
|
|
@@ -916,6 +936,12 @@ version = "0.3.1"
|
|
|
916
936
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
917
937
|
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
|
918
938
|
|
|
939
|
+
[[package]]
|
|
940
|
+
name = "constant_time_eq"
|
|
941
|
+
version = "0.4.2"
|
|
942
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
943
|
+
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
|
944
|
+
|
|
919
945
|
[[package]]
|
|
920
946
|
name = "cookie"
|
|
921
947
|
version = "0.18.1"
|
|
@@ -1189,15 +1215,15 @@ checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
|
|
|
1189
1215
|
|
|
1190
1216
|
[[package]]
|
|
1191
1217
|
name = "deflate64"
|
|
1192
|
-
version = "0.1.
|
|
1218
|
+
version = "0.1.12"
|
|
1193
1219
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1194
|
-
checksum = "
|
|
1220
|
+
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
|
|
1195
1221
|
|
|
1196
1222
|
[[package]]
|
|
1197
1223
|
name = "der"
|
|
1198
|
-
version = "0.
|
|
1224
|
+
version = "0.8.0"
|
|
1199
1225
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1200
|
-
checksum = "
|
|
1226
|
+
checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
|
|
1201
1227
|
dependencies = [
|
|
1202
1228
|
"pem-rfc7468",
|
|
1203
1229
|
"zeroize",
|
|
@@ -2001,7 +2027,7 @@ dependencies = [
|
|
|
2001
2027
|
"serde",
|
|
2002
2028
|
"serde_json",
|
|
2003
2029
|
"thiserror 2.0.18",
|
|
2004
|
-
"ureq 3.
|
|
2030
|
+
"ureq 3.3.0",
|
|
2005
2031
|
"windows-sys 0.61.2",
|
|
2006
2032
|
]
|
|
2007
2033
|
|
|
@@ -2031,9 +2057,9 @@ dependencies = [
|
|
|
2031
2057
|
|
|
2032
2058
|
[[package]]
|
|
2033
2059
|
name = "html-to-markdown-rs"
|
|
2034
|
-
version = "2.
|
|
2060
|
+
version = "2.29.0"
|
|
2035
2061
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2036
|
-
checksum = "
|
|
2062
|
+
checksum = "9013679b8c3600142e5a8f742748c3c38c49d9fc50675dad62f8f1721090a85a"
|
|
2037
2063
|
dependencies = [
|
|
2038
2064
|
"ahash",
|
|
2039
2065
|
"astral-tl",
|
|
@@ -2643,9 +2669,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
|
|
|
2643
2669
|
|
|
2644
2670
|
[[package]]
|
|
2645
2671
|
name = "iri-string"
|
|
2646
|
-
version = "0.7.
|
|
2672
|
+
version = "0.7.11"
|
|
2647
2673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2648
|
-
checksum = "
|
|
2674
|
+
checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
|
|
2649
2675
|
dependencies = [
|
|
2650
2676
|
"memchr",
|
|
2651
2677
|
"serde",
|
|
@@ -2731,7 +2757,7 @@ dependencies = [
|
|
|
2731
2757
|
"cesu8",
|
|
2732
2758
|
"cfg-if",
|
|
2733
2759
|
"combine",
|
|
2734
|
-
"jni-sys",
|
|
2760
|
+
"jni-sys 0.3.1",
|
|
2735
2761
|
"log",
|
|
2736
2762
|
"thiserror 1.0.69",
|
|
2737
2763
|
"walkdir",
|
|
@@ -2740,9 +2766,31 @@ dependencies = [
|
|
|
2740
2766
|
|
|
2741
2767
|
[[package]]
|
|
2742
2768
|
name = "jni-sys"
|
|
2743
|
-
version = "0.3.
|
|
2769
|
+
version = "0.3.1"
|
|
2770
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2771
|
+
checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
|
|
2772
|
+
dependencies = [
|
|
2773
|
+
"jni-sys 0.4.1",
|
|
2774
|
+
]
|
|
2775
|
+
|
|
2776
|
+
[[package]]
|
|
2777
|
+
name = "jni-sys"
|
|
2778
|
+
version = "0.4.1"
|
|
2779
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2780
|
+
checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
|
|
2781
|
+
dependencies = [
|
|
2782
|
+
"jni-sys-macros",
|
|
2783
|
+
]
|
|
2784
|
+
|
|
2785
|
+
[[package]]
|
|
2786
|
+
name = "jni-sys-macros"
|
|
2787
|
+
version = "0.4.1"
|
|
2744
2788
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2745
|
-
checksum = "
|
|
2789
|
+
checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
|
|
2790
|
+
dependencies = [
|
|
2791
|
+
"quote",
|
|
2792
|
+
"syn",
|
|
2793
|
+
]
|
|
2746
2794
|
|
|
2747
2795
|
[[package]]
|
|
2748
2796
|
name = "jobserver"
|
|
@@ -2781,7 +2829,7 @@ dependencies = [
|
|
|
2781
2829
|
|
|
2782
2830
|
[[package]]
|
|
2783
2831
|
name = "kreuzberg"
|
|
2784
|
-
version = "4.5.
|
|
2832
|
+
version = "4.5.3"
|
|
2785
2833
|
dependencies = [
|
|
2786
2834
|
"ahash",
|
|
2787
2835
|
"async-trait",
|
|
@@ -2790,6 +2838,7 @@ dependencies = [
|
|
|
2790
2838
|
"biblatex",
|
|
2791
2839
|
"biblib",
|
|
2792
2840
|
"bitvec",
|
|
2841
|
+
"blake3",
|
|
2793
2842
|
"bytes",
|
|
2794
2843
|
"calamine",
|
|
2795
2844
|
"cfb 0.14.0",
|
|
@@ -2843,6 +2892,7 @@ dependencies = [
|
|
|
2843
2892
|
"serde_yaml_ng",
|
|
2844
2893
|
"sevenz-rust2",
|
|
2845
2894
|
"sha2",
|
|
2895
|
+
"snap",
|
|
2846
2896
|
"tar",
|
|
2847
2897
|
"text-splitter",
|
|
2848
2898
|
"thiserror 2.0.18",
|
|
@@ -2855,16 +2905,16 @@ dependencies = [
|
|
|
2855
2905
|
"tracing",
|
|
2856
2906
|
"tracing-opentelemetry",
|
|
2857
2907
|
"unicode-normalization",
|
|
2858
|
-
"ureq 3.
|
|
2908
|
+
"ureq 3.3.0",
|
|
2859
2909
|
"utoipa",
|
|
2860
2910
|
"whatlang",
|
|
2861
2911
|
"yake-rust",
|
|
2862
|
-
"zip
|
|
2912
|
+
"zip 7.2.0",
|
|
2863
2913
|
]
|
|
2864
2914
|
|
|
2865
2915
|
[[package]]
|
|
2866
2916
|
name = "kreuzberg-ffi"
|
|
2867
|
-
version = "4.5.
|
|
2917
|
+
version = "4.5.3"
|
|
2868
2918
|
dependencies = [
|
|
2869
2919
|
"ahash",
|
|
2870
2920
|
"async-trait",
|
|
@@ -2880,7 +2930,7 @@ dependencies = [
|
|
|
2880
2930
|
|
|
2881
2931
|
[[package]]
|
|
2882
2932
|
name = "kreuzberg-paddle-ocr"
|
|
2883
|
-
version = "4.5.
|
|
2933
|
+
version = "4.5.3"
|
|
2884
2934
|
dependencies = [
|
|
2885
2935
|
"geo-clipper",
|
|
2886
2936
|
"geo-types",
|
|
@@ -2894,7 +2944,7 @@ dependencies = [
|
|
|
2894
2944
|
|
|
2895
2945
|
[[package]]
|
|
2896
2946
|
name = "kreuzberg-pdfium-render"
|
|
2897
|
-
version = "4.5.
|
|
2947
|
+
version = "4.5.3"
|
|
2898
2948
|
dependencies = [
|
|
2899
2949
|
"bitflags",
|
|
2900
2950
|
"bytemuck",
|
|
@@ -2917,7 +2967,7 @@ dependencies = [
|
|
|
2917
2967
|
|
|
2918
2968
|
[[package]]
|
|
2919
2969
|
name = "kreuzberg-rb"
|
|
2920
|
-
version = "4.5.
|
|
2970
|
+
version = "4.5.3"
|
|
2921
2971
|
dependencies = [
|
|
2922
2972
|
"async-trait",
|
|
2923
2973
|
"html-to-markdown-rs",
|
|
@@ -2934,13 +2984,13 @@ dependencies = [
|
|
|
2934
2984
|
|
|
2935
2985
|
[[package]]
|
|
2936
2986
|
name = "kreuzberg-tesseract"
|
|
2937
|
-
version = "4.5.
|
|
2987
|
+
version = "4.5.3"
|
|
2938
2988
|
dependencies = [
|
|
2939
2989
|
"cc",
|
|
2940
2990
|
"cmake",
|
|
2941
2991
|
"reqwest",
|
|
2942
2992
|
"thiserror 2.0.18",
|
|
2943
|
-
"zip
|
|
2993
|
+
"zip 7.2.0",
|
|
2944
2994
|
]
|
|
2945
2995
|
|
|
2946
2996
|
[[package]]
|
|
@@ -3712,7 +3762,7 @@ dependencies = [
|
|
|
3712
3762
|
"ort-sys",
|
|
3713
3763
|
"smallvec",
|
|
3714
3764
|
"tracing",
|
|
3715
|
-
"ureq 3.
|
|
3765
|
+
"ureq 3.3.0",
|
|
3716
3766
|
]
|
|
3717
3767
|
|
|
3718
3768
|
[[package]]
|
|
@@ -3723,7 +3773,7 @@ checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
|
|
|
3723
3773
|
dependencies = [
|
|
3724
3774
|
"hmac-sha256",
|
|
3725
3775
|
"lzma-rust2 0.15.7",
|
|
3726
|
-
"ureq 3.
|
|
3776
|
+
"ureq 3.3.0",
|
|
3727
3777
|
]
|
|
3728
3778
|
|
|
3729
3779
|
[[package]]
|
|
@@ -3779,9 +3829,9 @@ dependencies = [
|
|
|
3779
3829
|
|
|
3780
3830
|
[[package]]
|
|
3781
3831
|
name = "pem-rfc7468"
|
|
3782
|
-
version = "0.
|
|
3832
|
+
version = "1.0.0"
|
|
3783
3833
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3784
|
-
checksum = "
|
|
3834
|
+
checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9"
|
|
3785
3835
|
dependencies = [
|
|
3786
3836
|
"base64ct",
|
|
3787
3837
|
]
|
|
@@ -3977,9 +4027,9 @@ dependencies = [
|
|
|
3977
4027
|
|
|
3978
4028
|
[[package]]
|
|
3979
4029
|
name = "pulldown-cmark"
|
|
3980
|
-
version = "0.13.
|
|
4030
|
+
version = "0.13.3"
|
|
3981
4031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3982
|
-
checksum = "
|
|
4032
|
+
checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
|
|
3983
4033
|
dependencies = [
|
|
3984
4034
|
"bitflags",
|
|
3985
4035
|
"getopts",
|
|
@@ -4952,6 +5002,12 @@ version = "1.15.1"
|
|
|
4952
5002
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4953
5003
|
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
|
4954
5004
|
|
|
5005
|
+
[[package]]
|
|
5006
|
+
name = "snap"
|
|
5007
|
+
version = "1.1.1"
|
|
5008
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5009
|
+
checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
|
|
5010
|
+
|
|
4955
5011
|
[[package]]
|
|
4956
5012
|
name = "socket2"
|
|
4957
5013
|
version = "0.6.3"
|
|
@@ -5745,9 +5801,9 @@ dependencies = [
|
|
|
5745
5801
|
|
|
5746
5802
|
[[package]]
|
|
5747
5803
|
name = "ureq"
|
|
5748
|
-
version = "3.
|
|
5804
|
+
version = "3.3.0"
|
|
5749
5805
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5750
|
-
checksum = "
|
|
5806
|
+
checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
|
|
5751
5807
|
dependencies = [
|
|
5752
5808
|
"base64 0.22.1",
|
|
5753
5809
|
"cookie_store",
|
|
@@ -5769,9 +5825,9 @@ dependencies = [
|
|
|
5769
5825
|
|
|
5770
5826
|
[[package]]
|
|
5771
5827
|
name = "ureq-proto"
|
|
5772
|
-
version = "0.
|
|
5828
|
+
version = "0.6.0"
|
|
5773
5829
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5774
|
-
checksum = "
|
|
5830
|
+
checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
|
|
5775
5831
|
dependencies = [
|
|
5776
5832
|
"base64 0.22.1",
|
|
5777
5833
|
"http",
|
|
@@ -6784,7 +6840,7 @@ dependencies = [
|
|
|
6784
6840
|
"aes",
|
|
6785
6841
|
"arbitrary",
|
|
6786
6842
|
"bzip2 0.5.2",
|
|
6787
|
-
"constant_time_eq",
|
|
6843
|
+
"constant_time_eq 0.3.1",
|
|
6788
6844
|
"crc32fast",
|
|
6789
6845
|
"crossbeam-utils",
|
|
6790
6846
|
"deflate64",
|
|
@@ -6819,19 +6875,6 @@ dependencies = [
|
|
|
6819
6875
|
"zopfli",
|
|
6820
6876
|
]
|
|
6821
6877
|
|
|
6822
|
-
[[package]]
|
|
6823
|
-
name = "zip"
|
|
6824
|
-
version = "8.3.0"
|
|
6825
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6826
|
-
checksum = "4a243cfad17427fc077f529da5a95abe4e94fd2bfdb601611870a6557cc67657"
|
|
6827
|
-
dependencies = [
|
|
6828
|
-
"crc32fast",
|
|
6829
|
-
"flate2",
|
|
6830
|
-
"indexmap",
|
|
6831
|
-
"memchr",
|
|
6832
|
-
"typed-path",
|
|
6833
|
-
]
|
|
6834
|
-
|
|
6835
6878
|
[[package]]
|
|
6836
6879
|
name = "zlib-rs"
|
|
6837
6880
|
version = "0.6.3"
|
|
@@ -6901,9 +6944,9 @@ dependencies = [
|
|
|
6901
6944
|
|
|
6902
6945
|
[[package]]
|
|
6903
6946
|
name = "zune-jpeg"
|
|
6904
|
-
version = "0.5.
|
|
6947
|
+
version = "0.5.14"
|
|
6905
6948
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6906
|
-
checksum = "
|
|
6949
|
+
checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6"
|
|
6907
6950
|
dependencies = [
|
|
6908
6951
|
"zune-core",
|
|
6909
6952
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.5.
|
|
3
|
+
version = "4.5.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -64,7 +64,7 @@ tokio = { version = "1.50.0", features = [
|
|
|
64
64
|
"time",
|
|
65
65
|
"io-util",
|
|
66
66
|
] }
|
|
67
|
-
html-to-markdown-rs = { version = "2.
|
|
67
|
+
html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
68
68
|
|
|
69
69
|
[dev-dependencies]
|
|
70
70
|
pretty_assertions = "1.4"
|
|
@@ -139,6 +139,12 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
|
|
|
139
139
|
|
|
140
140
|
let sizing = parse_chunk_sizing(ruby, hash)?;
|
|
141
141
|
|
|
142
|
+
let prepend_heading_context = if let Some(val) = get_kw(ruby, hash, "prepend_heading_context") {
|
|
143
|
+
bool::try_convert(val)?
|
|
144
|
+
} else {
|
|
145
|
+
false
|
|
146
|
+
};
|
|
147
|
+
|
|
142
148
|
let config = ChunkingConfig {
|
|
143
149
|
max_characters: max_chars,
|
|
144
150
|
overlap: max_overlap,
|
|
@@ -147,6 +153,7 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
|
|
|
147
153
|
embedding,
|
|
148
154
|
preset,
|
|
149
155
|
sizing,
|
|
156
|
+
prepend_heading_context,
|
|
150
157
|
};
|
|
151
158
|
|
|
152
159
|
Ok(config)
|
|
@@ -315,12 +322,11 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
|
315
322
|
None
|
|
316
323
|
};
|
|
317
324
|
|
|
318
|
-
let allow_single_column_tables =
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
};
|
|
325
|
+
let allow_single_column_tables = if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
|
|
326
|
+
bool::try_convert(val)?
|
|
327
|
+
} else {
|
|
328
|
+
false
|
|
329
|
+
};
|
|
324
330
|
|
|
325
331
|
let config = PdfConfig {
|
|
326
332
|
extract_images,
|
|
@@ -819,10 +825,19 @@ pub fn parse_layout_detection_config(ruby: &Ruby, hash: RHash) -> Result<LayoutD
|
|
|
819
825
|
true
|
|
820
826
|
};
|
|
821
827
|
|
|
828
|
+
let table_model = if let Some(val) = get_kw(ruby, hash, "table_model")
|
|
829
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
830
|
+
{
|
|
831
|
+
Some(String::try_convert(val)?)
|
|
832
|
+
} else {
|
|
833
|
+
None
|
|
834
|
+
};
|
|
835
|
+
|
|
822
836
|
let config = LayoutDetectionConfig {
|
|
823
837
|
preset,
|
|
824
838
|
confidence_threshold,
|
|
825
839
|
apply_heuristics,
|
|
840
|
+
table_model,
|
|
826
841
|
};
|
|
827
842
|
|
|
828
843
|
Ok(config)
|
|
@@ -952,9 +967,8 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
952
967
|
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
953
968
|
{
|
|
954
969
|
let security_json = ruby_value_to_json(val)?;
|
|
955
|
-
let parsed: kreuzberg::extractors::security::SecurityLimits =
|
|
956
|
-
|
|
957
|
-
.map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
|
|
970
|
+
let parsed: kreuzberg::extractors::security::SecurityLimits = serde_json::from_value(security_json)
|
|
971
|
+
.map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
|
|
958
972
|
config.security_limits = Some(parsed);
|
|
959
973
|
}
|
|
960
974
|
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -850,19 +850,21 @@ module Kreuzberg
|
|
|
850
850
|
# )
|
|
851
851
|
#
|
|
852
852
|
class LayoutDetection
|
|
853
|
-
attr_reader :preset, :confidence_threshold, :apply_heuristics
|
|
853
|
+
attr_reader :preset, :confidence_threshold, :apply_heuristics, :table_model
|
|
854
854
|
|
|
855
|
-
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
|
|
855
|
+
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true, table_model: nil)
|
|
856
856
|
@preset = preset.to_s
|
|
857
857
|
@confidence_threshold = confidence_threshold&.to_f
|
|
858
858
|
@apply_heuristics = apply_heuristics ? true : false
|
|
859
|
+
@table_model = table_model&.to_s
|
|
859
860
|
end
|
|
860
861
|
|
|
861
862
|
def to_h
|
|
862
863
|
{
|
|
863
864
|
preset: @preset,
|
|
864
865
|
confidence_threshold: @confidence_threshold,
|
|
865
|
-
apply_heuristics: @apply_heuristics
|
|
866
|
+
apply_heuristics: @apply_heuristics,
|
|
867
|
+
table_model: @table_model
|
|
866
868
|
}.compact
|
|
867
869
|
end
|
|
868
870
|
end
|
|
@@ -930,7 +932,8 @@ module Kreuzberg
|
|
|
930
932
|
:images, :postprocessor,
|
|
931
933
|
:token_reduction, :keywords, :html_options, :pages,
|
|
932
934
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
933
|
-
:security_limits, :layout, :concurrency
|
|
935
|
+
:security_limits, :layout, :concurrency,
|
|
936
|
+
:cache_namespace, :cache_ttl_secs
|
|
934
937
|
|
|
935
938
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
936
939
|
alias image_extraction images
|
|
@@ -955,7 +958,7 @@ module Kreuzberg
|
|
|
955
958
|
language_detection pdf_options image_extraction
|
|
956
959
|
postprocessor token_reduction keywords html_options pages
|
|
957
960
|
max_concurrent_extractions output_format result_format
|
|
958
|
-
security_limits layout concurrency
|
|
961
|
+
security_limits layout concurrency cache_namespace cache_ttl_secs
|
|
959
962
|
].freeze
|
|
960
963
|
|
|
961
964
|
# Aliases for backward compatibility
|
|
@@ -1032,7 +1035,9 @@ module Kreuzberg
|
|
|
1032
1035
|
result_format: nil,
|
|
1033
1036
|
security_limits: nil,
|
|
1034
1037
|
layout: nil,
|
|
1035
|
-
concurrency: nil
|
|
1038
|
+
concurrency: nil,
|
|
1039
|
+
cache_namespace: nil,
|
|
1040
|
+
cache_ttl_secs: nil)
|
|
1036
1041
|
kwargs = {
|
|
1037
1042
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1038
1043
|
force_ocr: force_ocr, include_document_structure: include_document_structure,
|
|
@@ -1043,7 +1048,9 @@ module Kreuzberg
|
|
|
1043
1048
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
1044
1049
|
output_format: output_format, result_format: result_format,
|
|
1045
1050
|
security_limits: security_limits, layout: layout,
|
|
1046
|
-
concurrency: concurrency
|
|
1051
|
+
concurrency: concurrency,
|
|
1052
|
+
cache_namespace: cache_namespace,
|
|
1053
|
+
cache_ttl_secs: cache_ttl_secs
|
|
1047
1054
|
}
|
|
1048
1055
|
extracted = extract_from_hash(hash, kwargs)
|
|
1049
1056
|
|
|
@@ -1077,6 +1084,8 @@ module Kreuzberg
|
|
|
1077
1084
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
1078
1085
|
@output_format = validate_output_format(params[:output_format])
|
|
1079
1086
|
@result_format = validate_result_format(params[:result_format])
|
|
1087
|
+
@cache_namespace = params[:cache_namespace]
|
|
1088
|
+
@cache_ttl_secs = params[:cache_ttl_secs]&.to_i
|
|
1080
1089
|
@security_limits = params[:security_limits]
|
|
1081
1090
|
end
|
|
1082
1091
|
|
|
@@ -1112,7 +1121,9 @@ module Kreuzberg
|
|
|
1112
1121
|
include_document_structure: @include_document_structure,
|
|
1113
1122
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1114
1123
|
output_format: @output_format,
|
|
1115
|
-
result_format: @result_format
|
|
1124
|
+
result_format: @result_format,
|
|
1125
|
+
cache_namespace: @cache_namespace,
|
|
1126
|
+
cache_ttl_secs: @cache_ttl_secs
|
|
1116
1127
|
}
|
|
1117
1128
|
end
|
|
1118
1129
|
|
|
@@ -1271,6 +1282,10 @@ module Kreuzberg
|
|
|
1271
1282
|
@output_format = validate_output_format(value)
|
|
1272
1283
|
when :result_format
|
|
1273
1284
|
@result_format = validate_result_format(value)
|
|
1285
|
+
when :cache_namespace
|
|
1286
|
+
@cache_namespace = value
|
|
1287
|
+
when :cache_ttl_secs
|
|
1288
|
+
@cache_ttl_secs = value&.to_i
|
|
1274
1289
|
else
|
|
1275
1290
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1276
1291
|
end
|
|
@@ -1352,6 +1367,8 @@ module Kreuzberg
|
|
|
1352
1367
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1353
1368
|
@output_format = merged.output_format
|
|
1354
1369
|
@result_format = merged.result_format
|
|
1370
|
+
@cache_namespace = merged.cache_namespace
|
|
1371
|
+
@cache_ttl_secs = merged.cache_ttl_secs
|
|
1355
1372
|
end
|
|
1356
1373
|
end
|
|
1357
1374
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -463,8 +463,9 @@ module Kreuzberg
|
|
|
463
463
|
attr_reader preset: String
|
|
464
464
|
attr_reader confidence_threshold: Float?
|
|
465
465
|
attr_reader apply_heuristics: bool
|
|
466
|
+
attr_reader table_model: String?
|
|
466
467
|
|
|
467
|
-
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
|
|
468
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
|
|
468
469
|
def to_h: () -> Hash[Symbol, untyped]
|
|
469
470
|
end
|
|
470
471
|
|
|
@@ -478,6 +479,8 @@ module Kreuzberg
|
|
|
478
479
|
class Extraction
|
|
479
480
|
attr_reader use_cache: bool
|
|
480
481
|
attr_reader enable_quality_processing: bool
|
|
482
|
+
attr_reader cache_namespace: String?
|
|
483
|
+
attr_reader cache_ttl_secs: Integer?
|
|
481
484
|
attr_reader force_ocr: bool
|
|
482
485
|
attr_reader include_document_structure: bool
|
|
483
486
|
attr_reader ocr: OCR?
|
|
@@ -520,7 +523,9 @@ module Kreuzberg
|
|
|
520
523
|
?concurrency: (Concurrency | Hash[Symbol, untyped])?,
|
|
521
524
|
?max_concurrent_extractions: Integer?,
|
|
522
525
|
?output_format: String?,
|
|
523
|
-
?result_format: String
|
|
526
|
+
?result_format: String?,
|
|
527
|
+
?cache_namespace: String?,
|
|
528
|
+
?cache_ttl_secs: Integer?
|
|
524
529
|
) -> void
|
|
525
530
|
def to_h: () -> Hash[Symbol, untyped]
|
|
526
531
|
def to_json: (*untyped) -> String
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.5.
|
|
5
|
+
version = "4.5.4"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -15,7 +15,9 @@ ahash = "0.8.12"
|
|
|
15
15
|
anyhow = "1.0"
|
|
16
16
|
async-trait = "0.1.89"
|
|
17
17
|
base64 = "0.22.1"
|
|
18
|
+
blake3 = "1"
|
|
18
19
|
bytes = { version = "1", features = ["serde"] }
|
|
20
|
+
cfb = "0.14"
|
|
19
21
|
chrono = "0.4"
|
|
20
22
|
clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
21
23
|
console_error_panic_hook = "0.1"
|
|
@@ -24,13 +26,12 @@ ctor = "0.6"
|
|
|
24
26
|
dbase = "0.7"
|
|
25
27
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
26
28
|
hex = "0.4.3"
|
|
27
|
-
html-to-markdown-rs = { version = "2.
|
|
28
|
-
hwpers = "0.5"
|
|
29
|
+
html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
29
30
|
image = { version = "0.25.10", default-features = false }
|
|
30
31
|
itertools = "0.14"
|
|
31
32
|
js-sys = "0.3"
|
|
32
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.
|
|
33
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.4", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.4" }
|
|
34
35
|
lazy_static = "1.5.0"
|
|
35
36
|
libc = "0.2.183"
|
|
36
37
|
log = "0.4"
|