kreuzberg 4.5.2 → 4.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +4 -4
- data/ext/kreuzberg_rb/native/Cargo.lock +62 -46
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -9
- data/lib/kreuzberg/config.rb +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +2 -1
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +144 -112
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/chunking/core.rs +47 -0
- data/vendor/kreuzberg/src/core/config/layout.rs +12 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +19 -0
- data/vendor/kreuzberg/src/core/mime.rs +42 -1
- data/vendor/kreuzberg/src/extraction/hwp/error.rs +54 -0
- data/vendor/kreuzberg/src/extraction/hwp/mod.rs +72 -0
- data/vendor/kreuzberg/src/extraction/hwp/model.rs +102 -0
- data/vendor/kreuzberg/src/extraction/hwp/parser.rs +174 -0
- data/vendor/kreuzberg/src/extraction/hwp/reader.rs +126 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +3 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +58 -7
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +19 -5
- data/vendor/kreuzberg/src/extractors/hwp.rs +4 -5
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +189 -0
- data/vendor/kreuzberg/src/extractors/iwork/mod.rs +291 -0
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +186 -0
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +182 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +13 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +4 -0
- data/vendor/kreuzberg/src/layout/mod.rs +133 -0
- data/vendor/kreuzberg/src/layout/model_manager.rs +61 -2
- data/vendor/kreuzberg/src/layout/models/mod.rs +2 -0
- data/vendor/kreuzberg/src/layout/models/slanet.rs +550 -0
- data/vendor/kreuzberg/src/layout/models/table_classifier.rs +219 -0
- data/vendor/kreuzberg/src/pdf/images.rs +13 -0
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +244 -65
- data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/table_recognition.rs +334 -1
- data/vendor/kreuzberg/tests/epub_markdown_headings_tests.rs +177 -0
- data/vendor/kreuzberg/tests/iwork_integration.rs +220 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +14 -14
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +14 -14
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +17 -17
- data/vendor/kreuzberg-tesseract/Cargo.toml +27 -27
- metadata +15 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47a14cc623891453596552fd893b43f18ffa04068de61b47b34b7f18ad8af890
|
|
4
|
+
data.tar.gz: 5e18a52f5acbabba2ee64790b3831c89301e58043c8b22ba7616791a2338401e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9ac2251c79bcdff41d8746ea3244bf3774cd022aa0f9c63a0153a7ec394f86d1bfd4fe3b364d175371f602006aafea17f612250b2b8368e213989b910f17940e
|
|
7
|
+
data.tar.gz: 8494619ec2253eaeb68b95fc4e204355ab2a08f71f5863f88b8305c7c07333a456f73c497804de7ca7693b1207bd0afaef3e6ca0ab5cb811eee1686823bff31d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.5.
|
|
4
|
+
kreuzberg (4.5.4)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -222,7 +222,7 @@ CHECKSUMS
|
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
223
|
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.5.
|
|
225
|
+
kreuzberg (4.5.4)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.4" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -61,7 +61,7 @@
|
|
|
61
61
|
</div>
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
Extract text, tables, images, and metadata from
|
|
64
|
+
Extract text, tables, images, and metadata from 91+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
|
|
65
65
|
|
|
66
66
|
|
|
67
67
|
## Installation
|
|
@@ -211,9 +211,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
211
211
|
|
|
212
212
|
## Features
|
|
213
213
|
|
|
214
|
-
### Supported File Formats (
|
|
214
|
+
### Supported File Formats (91+)
|
|
215
215
|
|
|
216
|
-
|
|
216
|
+
91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
217
217
|
|
|
218
218
|
#### Office Documents
|
|
219
219
|
|
|
@@ -1215,15 +1215,15 @@ checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2"
|
|
|
1215
1215
|
|
|
1216
1216
|
[[package]]
|
|
1217
1217
|
name = "deflate64"
|
|
1218
|
-
version = "0.1.
|
|
1218
|
+
version = "0.1.12"
|
|
1219
1219
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1220
|
-
checksum = "
|
|
1220
|
+
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
|
|
1221
1221
|
|
|
1222
1222
|
[[package]]
|
|
1223
1223
|
name = "der"
|
|
1224
|
-
version = "0.
|
|
1224
|
+
version = "0.8.0"
|
|
1225
1225
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1226
|
-
checksum = "
|
|
1226
|
+
checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
|
|
1227
1227
|
dependencies = [
|
|
1228
1228
|
"pem-rfc7468",
|
|
1229
1229
|
"zeroize",
|
|
@@ -2027,7 +2027,7 @@ dependencies = [
|
|
|
2027
2027
|
"serde",
|
|
2028
2028
|
"serde_json",
|
|
2029
2029
|
"thiserror 2.0.18",
|
|
2030
|
-
"ureq 3.
|
|
2030
|
+
"ureq 3.3.0",
|
|
2031
2031
|
"windows-sys 0.61.2",
|
|
2032
2032
|
]
|
|
2033
2033
|
|
|
@@ -2057,9 +2057,9 @@ dependencies = [
|
|
|
2057
2057
|
|
|
2058
2058
|
[[package]]
|
|
2059
2059
|
name = "html-to-markdown-rs"
|
|
2060
|
-
version = "2.
|
|
2060
|
+
version = "2.29.0"
|
|
2061
2061
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2062
|
-
checksum = "
|
|
2062
|
+
checksum = "9013679b8c3600142e5a8f742748c3c38c49d9fc50675dad62f8f1721090a85a"
|
|
2063
2063
|
dependencies = [
|
|
2064
2064
|
"ahash",
|
|
2065
2065
|
"astral-tl",
|
|
@@ -2669,9 +2669,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
|
|
|
2669
2669
|
|
|
2670
2670
|
[[package]]
|
|
2671
2671
|
name = "iri-string"
|
|
2672
|
-
version = "0.7.
|
|
2672
|
+
version = "0.7.11"
|
|
2673
2673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2674
|
-
checksum = "
|
|
2674
|
+
checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
|
|
2675
2675
|
dependencies = [
|
|
2676
2676
|
"memchr",
|
|
2677
2677
|
"serde",
|
|
@@ -2757,7 +2757,7 @@ dependencies = [
|
|
|
2757
2757
|
"cesu8",
|
|
2758
2758
|
"cfg-if",
|
|
2759
2759
|
"combine",
|
|
2760
|
-
"jni-sys",
|
|
2760
|
+
"jni-sys 0.3.1",
|
|
2761
2761
|
"log",
|
|
2762
2762
|
"thiserror 1.0.69",
|
|
2763
2763
|
"walkdir",
|
|
@@ -2766,9 +2766,31 @@ dependencies = [
|
|
|
2766
2766
|
|
|
2767
2767
|
[[package]]
|
|
2768
2768
|
name = "jni-sys"
|
|
2769
|
-
version = "0.3.
|
|
2769
|
+
version = "0.3.1"
|
|
2770
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2771
|
+
checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
|
|
2772
|
+
dependencies = [
|
|
2773
|
+
"jni-sys 0.4.1",
|
|
2774
|
+
]
|
|
2775
|
+
|
|
2776
|
+
[[package]]
|
|
2777
|
+
name = "jni-sys"
|
|
2778
|
+
version = "0.4.1"
|
|
2779
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2780
|
+
checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
|
|
2781
|
+
dependencies = [
|
|
2782
|
+
"jni-sys-macros",
|
|
2783
|
+
]
|
|
2784
|
+
|
|
2785
|
+
[[package]]
|
|
2786
|
+
name = "jni-sys-macros"
|
|
2787
|
+
version = "0.4.1"
|
|
2770
2788
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2771
|
-
checksum = "
|
|
2789
|
+
checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
|
|
2790
|
+
dependencies = [
|
|
2791
|
+
"quote",
|
|
2792
|
+
"syn",
|
|
2793
|
+
]
|
|
2772
2794
|
|
|
2773
2795
|
[[package]]
|
|
2774
2796
|
name = "jobserver"
|
|
@@ -2807,7 +2829,7 @@ dependencies = [
|
|
|
2807
2829
|
|
|
2808
2830
|
[[package]]
|
|
2809
2831
|
name = "kreuzberg"
|
|
2810
|
-
version = "4.5.
|
|
2832
|
+
version = "4.5.3"
|
|
2811
2833
|
dependencies = [
|
|
2812
2834
|
"ahash",
|
|
2813
2835
|
"async-trait",
|
|
@@ -2870,6 +2892,7 @@ dependencies = [
|
|
|
2870
2892
|
"serde_yaml_ng",
|
|
2871
2893
|
"sevenz-rust2",
|
|
2872
2894
|
"sha2",
|
|
2895
|
+
"snap",
|
|
2873
2896
|
"tar",
|
|
2874
2897
|
"text-splitter",
|
|
2875
2898
|
"thiserror 2.0.18",
|
|
@@ -2882,16 +2905,16 @@ dependencies = [
|
|
|
2882
2905
|
"tracing",
|
|
2883
2906
|
"tracing-opentelemetry",
|
|
2884
2907
|
"unicode-normalization",
|
|
2885
|
-
"ureq 3.
|
|
2908
|
+
"ureq 3.3.0",
|
|
2886
2909
|
"utoipa",
|
|
2887
2910
|
"whatlang",
|
|
2888
2911
|
"yake-rust",
|
|
2889
|
-
"zip
|
|
2912
|
+
"zip 7.2.0",
|
|
2890
2913
|
]
|
|
2891
2914
|
|
|
2892
2915
|
[[package]]
|
|
2893
2916
|
name = "kreuzberg-ffi"
|
|
2894
|
-
version = "4.5.
|
|
2917
|
+
version = "4.5.3"
|
|
2895
2918
|
dependencies = [
|
|
2896
2919
|
"ahash",
|
|
2897
2920
|
"async-trait",
|
|
@@ -2907,7 +2930,7 @@ dependencies = [
|
|
|
2907
2930
|
|
|
2908
2931
|
[[package]]
|
|
2909
2932
|
name = "kreuzberg-paddle-ocr"
|
|
2910
|
-
version = "4.5.
|
|
2933
|
+
version = "4.5.3"
|
|
2911
2934
|
dependencies = [
|
|
2912
2935
|
"geo-clipper",
|
|
2913
2936
|
"geo-types",
|
|
@@ -2921,7 +2944,7 @@ dependencies = [
|
|
|
2921
2944
|
|
|
2922
2945
|
[[package]]
|
|
2923
2946
|
name = "kreuzberg-pdfium-render"
|
|
2924
|
-
version = "4.5.
|
|
2947
|
+
version = "4.5.3"
|
|
2925
2948
|
dependencies = [
|
|
2926
2949
|
"bitflags",
|
|
2927
2950
|
"bytemuck",
|
|
@@ -2944,7 +2967,7 @@ dependencies = [
|
|
|
2944
2967
|
|
|
2945
2968
|
[[package]]
|
|
2946
2969
|
name = "kreuzberg-rb"
|
|
2947
|
-
version = "4.5.
|
|
2970
|
+
version = "4.5.3"
|
|
2948
2971
|
dependencies = [
|
|
2949
2972
|
"async-trait",
|
|
2950
2973
|
"html-to-markdown-rs",
|
|
@@ -2961,13 +2984,13 @@ dependencies = [
|
|
|
2961
2984
|
|
|
2962
2985
|
[[package]]
|
|
2963
2986
|
name = "kreuzberg-tesseract"
|
|
2964
|
-
version = "4.5.
|
|
2987
|
+
version = "4.5.3"
|
|
2965
2988
|
dependencies = [
|
|
2966
2989
|
"cc",
|
|
2967
2990
|
"cmake",
|
|
2968
2991
|
"reqwest",
|
|
2969
2992
|
"thiserror 2.0.18",
|
|
2970
|
-
"zip
|
|
2993
|
+
"zip 7.2.0",
|
|
2971
2994
|
]
|
|
2972
2995
|
|
|
2973
2996
|
[[package]]
|
|
@@ -3739,7 +3762,7 @@ dependencies = [
|
|
|
3739
3762
|
"ort-sys",
|
|
3740
3763
|
"smallvec",
|
|
3741
3764
|
"tracing",
|
|
3742
|
-
"ureq 3.
|
|
3765
|
+
"ureq 3.3.0",
|
|
3743
3766
|
]
|
|
3744
3767
|
|
|
3745
3768
|
[[package]]
|
|
@@ -3750,7 +3773,7 @@ checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
|
|
|
3750
3773
|
dependencies = [
|
|
3751
3774
|
"hmac-sha256",
|
|
3752
3775
|
"lzma-rust2 0.15.7",
|
|
3753
|
-
"ureq 3.
|
|
3776
|
+
"ureq 3.3.0",
|
|
3754
3777
|
]
|
|
3755
3778
|
|
|
3756
3779
|
[[package]]
|
|
@@ -3806,9 +3829,9 @@ dependencies = [
|
|
|
3806
3829
|
|
|
3807
3830
|
[[package]]
|
|
3808
3831
|
name = "pem-rfc7468"
|
|
3809
|
-
version = "0.
|
|
3832
|
+
version = "1.0.0"
|
|
3810
3833
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3811
|
-
checksum = "
|
|
3834
|
+
checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9"
|
|
3812
3835
|
dependencies = [
|
|
3813
3836
|
"base64ct",
|
|
3814
3837
|
]
|
|
@@ -4004,9 +4027,9 @@ dependencies = [
|
|
|
4004
4027
|
|
|
4005
4028
|
[[package]]
|
|
4006
4029
|
name = "pulldown-cmark"
|
|
4007
|
-
version = "0.13.
|
|
4030
|
+
version = "0.13.3"
|
|
4008
4031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4009
|
-
checksum = "
|
|
4032
|
+
checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
|
|
4010
4033
|
dependencies = [
|
|
4011
4034
|
"bitflags",
|
|
4012
4035
|
"getopts",
|
|
@@ -4979,6 +5002,12 @@ version = "1.15.1"
|
|
|
4979
5002
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4980
5003
|
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
|
4981
5004
|
|
|
5005
|
+
[[package]]
|
|
5006
|
+
name = "snap"
|
|
5007
|
+
version = "1.1.1"
|
|
5008
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5009
|
+
checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
|
|
5010
|
+
|
|
4982
5011
|
[[package]]
|
|
4983
5012
|
name = "socket2"
|
|
4984
5013
|
version = "0.6.3"
|
|
@@ -5772,9 +5801,9 @@ dependencies = [
|
|
|
5772
5801
|
|
|
5773
5802
|
[[package]]
|
|
5774
5803
|
name = "ureq"
|
|
5775
|
-
version = "3.
|
|
5804
|
+
version = "3.3.0"
|
|
5776
5805
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5777
|
-
checksum = "
|
|
5806
|
+
checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
|
|
5778
5807
|
dependencies = [
|
|
5779
5808
|
"base64 0.22.1",
|
|
5780
5809
|
"cookie_store",
|
|
@@ -5796,9 +5825,9 @@ dependencies = [
|
|
|
5796
5825
|
|
|
5797
5826
|
[[package]]
|
|
5798
5827
|
name = "ureq-proto"
|
|
5799
|
-
version = "0.
|
|
5828
|
+
version = "0.6.0"
|
|
5800
5829
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5801
|
-
checksum = "
|
|
5830
|
+
checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
|
|
5802
5831
|
dependencies = [
|
|
5803
5832
|
"base64 0.22.1",
|
|
5804
5833
|
"http",
|
|
@@ -6846,19 +6875,6 @@ dependencies = [
|
|
|
6846
6875
|
"zopfli",
|
|
6847
6876
|
]
|
|
6848
6877
|
|
|
6849
|
-
[[package]]
|
|
6850
|
-
name = "zip"
|
|
6851
|
-
version = "8.3.0"
|
|
6852
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6853
|
-
checksum = "4a243cfad17427fc077f529da5a95abe4e94fd2bfdb601611870a6557cc67657"
|
|
6854
|
-
dependencies = [
|
|
6855
|
-
"crc32fast",
|
|
6856
|
-
"flate2",
|
|
6857
|
-
"indexmap",
|
|
6858
|
-
"memchr",
|
|
6859
|
-
"typed-path",
|
|
6860
|
-
]
|
|
6861
|
-
|
|
6862
6878
|
[[package]]
|
|
6863
6879
|
name = "zlib-rs"
|
|
6864
6880
|
version = "0.6.3"
|
|
@@ -6928,9 +6944,9 @@ dependencies = [
|
|
|
6928
6944
|
|
|
6929
6945
|
[[package]]
|
|
6930
6946
|
name = "zune-jpeg"
|
|
6931
|
-
version = "0.5.
|
|
6947
|
+
version = "0.5.14"
|
|
6932
6948
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6933
|
-
checksum = "
|
|
6949
|
+
checksum = "0b7a1c0af6e5d8d1363f4994b7a091ccf963d8b694f7da5b0b9cceb82da2c0a6"
|
|
6934
6950
|
dependencies = [
|
|
6935
6951
|
"zune-core",
|
|
6936
6952
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.5.
|
|
3
|
+
version = "4.5.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -64,7 +64,7 @@ tokio = { version = "1.50.0", features = [
|
|
|
64
64
|
"time",
|
|
65
65
|
"io-util",
|
|
66
66
|
] }
|
|
67
|
-
html-to-markdown-rs = { version = "2.
|
|
67
|
+
html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
68
68
|
|
|
69
69
|
[dev-dependencies]
|
|
70
70
|
pretty_assertions = "1.4"
|
|
@@ -139,6 +139,12 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
|
|
|
139
139
|
|
|
140
140
|
let sizing = parse_chunk_sizing(ruby, hash)?;
|
|
141
141
|
|
|
142
|
+
let prepend_heading_context = if let Some(val) = get_kw(ruby, hash, "prepend_heading_context") {
|
|
143
|
+
bool::try_convert(val)?
|
|
144
|
+
} else {
|
|
145
|
+
false
|
|
146
|
+
};
|
|
147
|
+
|
|
142
148
|
let config = ChunkingConfig {
|
|
143
149
|
max_characters: max_chars,
|
|
144
150
|
overlap: max_overlap,
|
|
@@ -147,6 +153,7 @@ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig,
|
|
|
147
153
|
embedding,
|
|
148
154
|
preset,
|
|
149
155
|
sizing,
|
|
156
|
+
prepend_heading_context,
|
|
150
157
|
};
|
|
151
158
|
|
|
152
159
|
Ok(config)
|
|
@@ -315,12 +322,11 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
|
315
322
|
None
|
|
316
323
|
};
|
|
317
324
|
|
|
318
|
-
let allow_single_column_tables =
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
};
|
|
325
|
+
let allow_single_column_tables = if let Some(val) = get_kw(ruby, hash, "allow_single_column_tables") {
|
|
326
|
+
bool::try_convert(val)?
|
|
327
|
+
} else {
|
|
328
|
+
false
|
|
329
|
+
};
|
|
324
330
|
|
|
325
331
|
let config = PdfConfig {
|
|
326
332
|
extract_images,
|
|
@@ -819,10 +825,19 @@ pub fn parse_layout_detection_config(ruby: &Ruby, hash: RHash) -> Result<LayoutD
|
|
|
819
825
|
true
|
|
820
826
|
};
|
|
821
827
|
|
|
828
|
+
let table_model = if let Some(val) = get_kw(ruby, hash, "table_model")
|
|
829
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
830
|
+
{
|
|
831
|
+
Some(String::try_convert(val)?)
|
|
832
|
+
} else {
|
|
833
|
+
None
|
|
834
|
+
};
|
|
835
|
+
|
|
822
836
|
let config = LayoutDetectionConfig {
|
|
823
837
|
preset,
|
|
824
838
|
confidence_threshold,
|
|
825
839
|
apply_heuristics,
|
|
840
|
+
table_model,
|
|
826
841
|
};
|
|
827
842
|
|
|
828
843
|
Ok(config)
|
|
@@ -952,9 +967,8 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
952
967
|
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
953
968
|
{
|
|
954
969
|
let security_json = ruby_value_to_json(val)?;
|
|
955
|
-
let parsed: kreuzberg::extractors::security::SecurityLimits =
|
|
956
|
-
|
|
957
|
-
.map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
|
|
970
|
+
let parsed: kreuzberg::extractors::security::SecurityLimits = serde_json::from_value(security_json)
|
|
971
|
+
.map_err(|e| runtime_error(format!("Invalid security_limits: {}", e)))?;
|
|
958
972
|
config.security_limits = Some(parsed);
|
|
959
973
|
}
|
|
960
974
|
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -850,19 +850,21 @@ module Kreuzberg
|
|
|
850
850
|
# )
|
|
851
851
|
#
|
|
852
852
|
class LayoutDetection
|
|
853
|
-
attr_reader :preset, :confidence_threshold, :apply_heuristics
|
|
853
|
+
attr_reader :preset, :confidence_threshold, :apply_heuristics, :table_model
|
|
854
854
|
|
|
855
|
-
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true)
|
|
855
|
+
def initialize(preset: 'fast', confidence_threshold: nil, apply_heuristics: true, table_model: nil)
|
|
856
856
|
@preset = preset.to_s
|
|
857
857
|
@confidence_threshold = confidence_threshold&.to_f
|
|
858
858
|
@apply_heuristics = apply_heuristics ? true : false
|
|
859
|
+
@table_model = table_model&.to_s
|
|
859
860
|
end
|
|
860
861
|
|
|
861
862
|
def to_h
|
|
862
863
|
{
|
|
863
864
|
preset: @preset,
|
|
864
865
|
confidence_threshold: @confidence_threshold,
|
|
865
|
-
apply_heuristics: @apply_heuristics
|
|
866
|
+
apply_heuristics: @apply_heuristics,
|
|
867
|
+
table_model: @table_model
|
|
866
868
|
}.compact
|
|
867
869
|
end
|
|
868
870
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -463,8 +463,9 @@ module Kreuzberg
|
|
|
463
463
|
attr_reader preset: String
|
|
464
464
|
attr_reader confidence_threshold: Float?
|
|
465
465
|
attr_reader apply_heuristics: bool
|
|
466
|
+
attr_reader table_model: String?
|
|
466
467
|
|
|
467
|
-
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool) -> void
|
|
468
|
+
def initialize: (?preset: String, ?confidence_threshold: Float?, ?apply_heuristics: bool, ?table_model: String?) -> void
|
|
468
469
|
def to_h: () -> Hash[Symbol, untyped]
|
|
469
470
|
end
|
|
470
471
|
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.5.
|
|
5
|
+
version = "4.5.4"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -17,6 +17,7 @@ async-trait = "0.1.89"
|
|
|
17
17
|
base64 = "0.22.1"
|
|
18
18
|
blake3 = "1"
|
|
19
19
|
bytes = { version = "1", features = ["serde"] }
|
|
20
|
+
cfb = "0.14"
|
|
20
21
|
chrono = "0.4"
|
|
21
22
|
clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
22
23
|
console_error_panic_hook = "0.1"
|
|
@@ -25,13 +26,12 @@ ctor = "0.6"
|
|
|
25
26
|
dbase = "0.7"
|
|
26
27
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
27
28
|
hex = "0.4.3"
|
|
28
|
-
html-to-markdown-rs = { version = "2.
|
|
29
|
-
hwpers = "0.5"
|
|
29
|
+
html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
30
30
|
image = { version = "0.25.10", default-features = false }
|
|
31
31
|
itertools = "0.14"
|
|
32
32
|
js-sys = "0.3"
|
|
33
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.
|
|
34
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.5.4", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.5.4" }
|
|
35
35
|
lazy_static = "1.5.0"
|
|
36
36
|
libc = "0.2.183"
|
|
37
37
|
log = "0.4"
|