kreuzberg 4.6.0 → 4.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +83 -14
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +18 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +30 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +5 -1
- data/lib/kreuzberg/config.rb +22 -8
- data/lib/kreuzberg/extraction_api.rb +37 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/render_spec.rb +91 -0
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +6 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +1 -0
- data/vendor/kreuzberg/src/api/openapi.rs +6 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +28 -0
- data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +23 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -0
- data/vendor/kreuzberg/src/error.rs +3 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -0
- data/vendor/kreuzberg/src/extraction/pst.rs +387 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +214 -1
- data/vendor/kreuzberg/src/extraction/transform/content.rs +40 -7
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +69 -12
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +159 -6
- data/vendor/kreuzberg/src/extractors/email.rs +3 -3
- data/vendor/kreuzberg/src/extractors/mod.rs +12 -2
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +122 -53
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +562 -194
- data/vendor/kreuzberg/src/extractors/pst.rs +264 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +14 -4
- data/vendor/kreuzberg/src/mcp/errors.rs +5 -0
- data/vendor/kreuzberg/src/pdf/error.rs +18 -0
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +214 -226
- data/vendor/kreuzberg/src/pdf/mod.rs +1 -1
- data/vendor/kreuzberg/src/pdf/rendering.rs +358 -32
- data/vendor/kreuzberg/src/pdf/text.rs +41 -2
- data/vendor/kreuzberg/test_documents/jsonl/simple.jsonl +3 -0
- data/vendor/kreuzberg/test_documents/jsonl/with_blanks.jsonl +5 -0
- data/vendor/kreuzberg/tests/api_consistency.rs +7 -0
- data/vendor/kreuzberg/tests/jsonl_integration.rs +82 -0
- data/vendor/kreuzberg/tests/pst_integration.rs +82 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +149 -2
- data/vendor/kreuzberg-ffi/src/config/merge.rs +7 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +37 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +6 -0
- data/vendor/kreuzberg-ffi/src/rendering.rs +325 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +15 -4
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f087c152499deb621223ba54a8ca450bb0510da3b43c880f3969e33ec4b4d5e5
|
|
4
|
+
data.tar.gz: dba38092babe378ec93e0dd4f307ab44c10519623302fdf14023a52eff6549b2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2e2cb3a3636555ac5f7d54bf473e36872efef5cf1796c667f727fa8622792a5cec009799d35c5ccb776997b7c39a0e5d0745266c7c5e5604d22b4e3e3e2f5e1f
|
|
7
|
+
data.tar.gz: b032174ab9b1366d3d05082ef6cfe3b1b700ff66d265fb28353d69b581aee6989688dd8e01b8c516f7046a8968ca86edd8a80cb72cb48477607eb8104d7facb3
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -499,6 +499,15 @@ dependencies = [
|
|
|
499
499
|
"generic-array",
|
|
500
500
|
]
|
|
501
501
|
|
|
502
|
+
[[package]]
|
|
503
|
+
name = "block-buffer"
|
|
504
|
+
version = "0.12.0"
|
|
505
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
506
|
+
checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
|
|
507
|
+
dependencies = [
|
|
508
|
+
"hybrid-array",
|
|
509
|
+
]
|
|
510
|
+
|
|
502
511
|
[[package]]
|
|
503
512
|
name = "block-padding"
|
|
504
513
|
version = "0.3.3"
|
|
@@ -726,7 +735,7 @@ version = "0.4.4"
|
|
|
726
735
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
727
736
|
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
|
|
728
737
|
dependencies = [
|
|
729
|
-
"crypto-common",
|
|
738
|
+
"crypto-common 0.1.7",
|
|
730
739
|
"inout",
|
|
731
740
|
]
|
|
732
741
|
|
|
@@ -877,6 +886,12 @@ dependencies = [
|
|
|
877
886
|
"web-sys",
|
|
878
887
|
]
|
|
879
888
|
|
|
889
|
+
[[package]]
|
|
890
|
+
name = "const-oid"
|
|
891
|
+
version = "0.10.2"
|
|
892
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
893
|
+
checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
|
|
894
|
+
|
|
880
895
|
[[package]]
|
|
881
896
|
name = "const-random"
|
|
882
897
|
version = "0.1.18"
|
|
@@ -1034,6 +1049,15 @@ dependencies = [
|
|
|
1034
1049
|
"typenum",
|
|
1035
1050
|
]
|
|
1036
1051
|
|
|
1052
|
+
[[package]]
|
|
1053
|
+
name = "crypto-common"
|
|
1054
|
+
version = "0.2.1"
|
|
1055
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1056
|
+
checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710"
|
|
1057
|
+
dependencies = [
|
|
1058
|
+
"hybrid-array",
|
|
1059
|
+
]
|
|
1060
|
+
|
|
1037
1061
|
[[package]]
|
|
1038
1062
|
name = "ctor"
|
|
1039
1063
|
version = "0.6.3"
|
|
@@ -1232,8 +1256,19 @@ version = "0.10.7"
|
|
|
1232
1256
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1233
1257
|
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
|
1234
1258
|
dependencies = [
|
|
1235
|
-
"block-buffer",
|
|
1236
|
-
"crypto-common",
|
|
1259
|
+
"block-buffer 0.10.4",
|
|
1260
|
+
"crypto-common 0.1.7",
|
|
1261
|
+
]
|
|
1262
|
+
|
|
1263
|
+
[[package]]
|
|
1264
|
+
name = "digest"
|
|
1265
|
+
version = "0.11.2"
|
|
1266
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1267
|
+
checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c"
|
|
1268
|
+
dependencies = [
|
|
1269
|
+
"block-buffer 0.12.0",
|
|
1270
|
+
"const-oid",
|
|
1271
|
+
"crypto-common 0.2.1",
|
|
1237
1272
|
]
|
|
1238
1273
|
|
|
1239
1274
|
[[package]]
|
|
@@ -2061,6 +2096,15 @@ version = "1.0.3"
|
|
|
2061
2096
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2062
2097
|
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
|
2063
2098
|
|
|
2099
|
+
[[package]]
|
|
2100
|
+
name = "hybrid-array"
|
|
2101
|
+
version = "0.4.8"
|
|
2102
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2103
|
+
checksum = "8655f91cd07f2b9d0c24137bd650fe69617773435ee5ec83022377777ce65ef1"
|
|
2104
|
+
dependencies = [
|
|
2105
|
+
"typenum",
|
|
2106
|
+
]
|
|
2107
|
+
|
|
2064
2108
|
[[package]]
|
|
2065
2109
|
name = "hyper"
|
|
2066
2110
|
version = "1.8.1"
|
|
@@ -2612,7 +2656,7 @@ dependencies = [
|
|
|
2612
2656
|
|
|
2613
2657
|
[[package]]
|
|
2614
2658
|
name = "kreuzberg"
|
|
2615
|
-
version = "4.6.0"
|
|
2659
|
+
version = "4.6.2"
|
|
2616
2660
|
dependencies = [
|
|
2617
2661
|
"ahash",
|
|
2618
2662
|
"async-trait",
|
|
@@ -2626,6 +2670,7 @@ dependencies = [
|
|
|
2626
2670
|
"calamine",
|
|
2627
2671
|
"cfb 0.14.0",
|
|
2628
2672
|
"chardetng",
|
|
2673
|
+
"chrono",
|
|
2629
2674
|
"dashmap",
|
|
2630
2675
|
"dbase",
|
|
2631
2676
|
"encoding_rs",
|
|
@@ -2658,6 +2703,7 @@ dependencies = [
|
|
|
2658
2703
|
"opentelemetry_sdk",
|
|
2659
2704
|
"org",
|
|
2660
2705
|
"ort",
|
|
2706
|
+
"outlook-pst",
|
|
2661
2707
|
"parking_lot",
|
|
2662
2708
|
"pastey 0.2.1",
|
|
2663
2709
|
"pkg-config",
|
|
@@ -2673,9 +2719,10 @@ dependencies = [
|
|
|
2673
2719
|
"serde_json",
|
|
2674
2720
|
"serde_yaml_ng",
|
|
2675
2721
|
"sevenz-rust2",
|
|
2676
|
-
"sha2",
|
|
2722
|
+
"sha2 0.11.0",
|
|
2677
2723
|
"snap",
|
|
2678
2724
|
"tar",
|
|
2725
|
+
"tempfile",
|
|
2679
2726
|
"text-splitter",
|
|
2680
2727
|
"thiserror 2.0.18",
|
|
2681
2728
|
"tiff",
|
|
@@ -2696,7 +2743,7 @@ dependencies = [
|
|
|
2696
2743
|
|
|
2697
2744
|
[[package]]
|
|
2698
2745
|
name = "kreuzberg-ffi"
|
|
2699
|
-
version = "4.6.0"
|
|
2746
|
+
version = "4.6.2"
|
|
2700
2747
|
dependencies = [
|
|
2701
2748
|
"ahash",
|
|
2702
2749
|
"async-trait",
|
|
@@ -2712,7 +2759,7 @@ dependencies = [
|
|
|
2712
2759
|
|
|
2713
2760
|
[[package]]
|
|
2714
2761
|
name = "kreuzberg-paddle-ocr"
|
|
2715
|
-
version = "4.6.0"
|
|
2762
|
+
version = "4.6.2"
|
|
2716
2763
|
dependencies = [
|
|
2717
2764
|
"geo-clipper",
|
|
2718
2765
|
"geo-types",
|
|
@@ -2726,7 +2773,7 @@ dependencies = [
|
|
|
2726
2773
|
|
|
2727
2774
|
[[package]]
|
|
2728
2775
|
name = "kreuzberg-pdfium-render"
|
|
2729
|
-
version = "4.6.0"
|
|
2776
|
+
version = "4.6.2"
|
|
2730
2777
|
dependencies = [
|
|
2731
2778
|
"bitflags",
|
|
2732
2779
|
"bytemuck",
|
|
@@ -2749,7 +2796,7 @@ dependencies = [
|
|
|
2749
2796
|
|
|
2750
2797
|
[[package]]
|
|
2751
2798
|
name = "kreuzberg-rb"
|
|
2752
|
-
version = "4.6.0"
|
|
2799
|
+
version = "4.6.2"
|
|
2753
2800
|
dependencies = [
|
|
2754
2801
|
"async-trait",
|
|
2755
2802
|
"html-to-markdown-rs",
|
|
@@ -2766,7 +2813,7 @@ dependencies = [
|
|
|
2766
2813
|
|
|
2767
2814
|
[[package]]
|
|
2768
2815
|
name = "kreuzberg-tesseract"
|
|
2769
|
-
version = "4.6.0"
|
|
2816
|
+
version = "4.6.2"
|
|
2770
2817
|
dependencies = [
|
|
2771
2818
|
"cc",
|
|
2772
2819
|
"cmake",
|
|
@@ -2931,7 +2978,7 @@ dependencies = [
|
|
|
2931
2978
|
"rand 0.10.0",
|
|
2932
2979
|
"rangemap",
|
|
2933
2980
|
"rayon",
|
|
2934
|
-
"sha2",
|
|
2981
|
+
"sha2 0.10.9",
|
|
2935
2982
|
"stringprep",
|
|
2936
2983
|
"thiserror 2.0.18",
|
|
2937
2984
|
"time",
|
|
@@ -3068,7 +3115,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
3068
3115
|
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
|
|
3069
3116
|
dependencies = [
|
|
3070
3117
|
"cfg-if",
|
|
3071
|
-
"digest",
|
|
3118
|
+
"digest 0.10.7",
|
|
3072
3119
|
]
|
|
3073
3120
|
|
|
3074
3121
|
[[package]]
|
|
@@ -3540,6 +3587,17 @@ dependencies = [
|
|
|
3540
3587
|
"ureq 3.3.0",
|
|
3541
3588
|
]
|
|
3542
3589
|
|
|
3590
|
+
[[package]]
|
|
3591
|
+
name = "outlook-pst"
|
|
3592
|
+
version = "1.2.0"
|
|
3593
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3594
|
+
checksum = "299eace9d895cc31490927d7de779f8e8f3deb5fb6bd1e68eb192aa1da19bd47"
|
|
3595
|
+
dependencies = [
|
|
3596
|
+
"byteorder",
|
|
3597
|
+
"thiserror 2.0.18",
|
|
3598
|
+
"tracing",
|
|
3599
|
+
]
|
|
3600
|
+
|
|
3543
3601
|
[[package]]
|
|
3544
3602
|
name = "parking_lot"
|
|
3545
3603
|
version = "0.12.5"
|
|
@@ -4652,7 +4710,7 @@ dependencies = [
|
|
|
4652
4710
|
"js-sys",
|
|
4653
4711
|
"lzma-rust2 0.16.2",
|
|
4654
4712
|
"ppmd-rust",
|
|
4655
|
-
"sha2",
|
|
4713
|
+
"sha2 0.10.9",
|
|
4656
4714
|
"wasm-bindgen",
|
|
4657
4715
|
]
|
|
4658
4716
|
|
|
@@ -4664,7 +4722,18 @@ checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
|
|
4664
4722
|
dependencies = [
|
|
4665
4723
|
"cfg-if",
|
|
4666
4724
|
"cpufeatures 0.2.17",
|
|
4667
|
-
"digest",
|
|
4725
|
+
"digest 0.10.7",
|
|
4726
|
+
]
|
|
4727
|
+
|
|
4728
|
+
[[package]]
|
|
4729
|
+
name = "sha2"
|
|
4730
|
+
version = "0.11.0"
|
|
4731
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4732
|
+
checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4"
|
|
4733
|
+
dependencies = [
|
|
4734
|
+
"cfg-if",
|
|
4735
|
+
"cpufeatures 0.3.0",
|
|
4736
|
+
"digest 0.11.2",
|
|
4668
4737
|
]
|
|
4669
4738
|
|
|
4670
4739
|
[[package]]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.6.0"
|
|
3
|
+
version = "4.6.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -36,6 +36,7 @@ kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, feat
|
|
|
36
36
|
"chunking",
|
|
37
37
|
"chunking-tokenizers",
|
|
38
38
|
"embeddings",
|
|
39
|
+
"ort-bundled",
|
|
39
40
|
"quality",
|
|
40
41
|
"keywords",
|
|
41
42
|
"api",
|
|
@@ -875,6 +875,17 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
875
875
|
config.force_ocr = bool::try_convert(val)?;
|
|
876
876
|
}
|
|
877
877
|
|
|
878
|
+
if let Some(val) = get_kw(ruby, hash, "force_ocr_pages")
|
|
879
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
880
|
+
{
|
|
881
|
+
let pages_array = magnus::RArray::try_convert(val)?;
|
|
882
|
+
let pages: Vec<usize> = pages_array
|
|
883
|
+
.into_iter()
|
|
884
|
+
.map(|v| usize::try_convert(v))
|
|
885
|
+
.collect::<Result<Vec<_>, _>>()?;
|
|
886
|
+
config.force_ocr_pages = Some(pages);
|
|
887
|
+
}
|
|
888
|
+
|
|
878
889
|
if let Some(val) = get_kw(ruby, hash, "include_document_structure") {
|
|
879
890
|
config.include_document_structure = bool::try_convert(val)?;
|
|
880
891
|
}
|
|
@@ -1013,6 +1024,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
1013
1024
|
}
|
|
1014
1025
|
};
|
|
1015
1026
|
}
|
|
1027
|
+
|
|
1028
|
+
if let Some(val) = get_kw(ruby, hash, "extraction_timeout_secs")
|
|
1029
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
1030
|
+
{
|
|
1031
|
+
let secs = u64::try_convert(val)?;
|
|
1032
|
+
config.extraction_timeout_secs = Some(secs);
|
|
1033
|
+
}
|
|
1016
1034
|
}
|
|
1017
1035
|
|
|
1018
1036
|
Ok(config)
|
|
@@ -58,6 +58,36 @@ pub fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
|
58
58
|
extraction_result_to_ruby(&ruby, result)
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
+
/// Iterate over PDF pages, yielding (page_index, png_bytes) per page to a Ruby block.
|
|
62
|
+
pub fn render_pdf_pages_iter(path: String, dpi: i32) -> Result<(), Error> {
|
|
63
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
64
|
+
let dpi_opt = if dpi <= 0 { None } else { Some(dpi) };
|
|
65
|
+
|
|
66
|
+
let iter = kreuzberg::pdf::PdfPageIterator::from_file(&path, dpi_opt, None)
|
|
67
|
+
.map_err(|e| kreuzberg_error(e.into()))?;
|
|
68
|
+
|
|
69
|
+
for result in iter {
|
|
70
|
+
let (page_index, png_bytes) = result.map_err(|e| kreuzberg_error(e.into()))?;
|
|
71
|
+
let rb_index = ruby.integer_from_i64(page_index as i64);
|
|
72
|
+
let rb_bytes = ruby.str_from_slice(&png_bytes);
|
|
73
|
+
let _: magnus::Value = ruby.yield_values((rb_index, rb_bytes))?;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
Ok(())
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/// Render a single PDF page to PNG bytes.
|
|
80
|
+
pub fn native_render_pdf_page(path: String, page_index: i64, dpi: i64) -> Result<Vec<u8>, Error> {
|
|
81
|
+
if page_index < 0 {
|
|
82
|
+
return Err(crate::error_handling::runtime_error("page_index must be non-negative"));
|
|
83
|
+
}
|
|
84
|
+
let pdf_bytes = std::fs::read(&path)
|
|
85
|
+
.map_err(|e| crate::error_handling::runtime_error(format!("Failed to read file: {}", e)))?;
|
|
86
|
+
let dpi_opt = if dpi <= 0 { None } else { Some(dpi as i32) };
|
|
87
|
+
kreuzberg::pdf::render_pdf_page_to_png(&pdf_bytes, page_index as usize, dpi_opt, None)
|
|
88
|
+
.map_err(|e| kreuzberg_error(e.into()))
|
|
89
|
+
}
|
|
90
|
+
|
|
61
91
|
/// Extract content from bytes (asynchronous)
|
|
62
92
|
pub fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
63
93
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
@@ -23,7 +23,7 @@ pub use gc_guarded_value::GcGuardedValue;
|
|
|
23
23
|
pub use helpers::{get_kw, set_hash_entry, json_value_to_ruby, ruby_value_to_json, cache_root_dir, cache_directories};
|
|
24
24
|
pub use config::parse_extraction_config;
|
|
25
25
|
pub use result::extraction_result_to_ruby;
|
|
26
|
-
pub use extraction::{extract_file_sync, extract_bytes_sync, extract_file, extract_bytes};
|
|
26
|
+
pub use extraction::{extract_file_sync, extract_bytes_sync, extract_file, extract_bytes, render_pdf_pages_iter, native_render_pdf_page};
|
|
27
27
|
pub use batch::{
|
|
28
28
|
batch_extract_files_sync, batch_extract_bytes_sync, batch_extract_files, batch_extract_bytes,
|
|
29
29
|
};
|
|
@@ -442,6 +442,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
442
442
|
module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
|
|
443
443
|
module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
|
|
444
444
|
|
|
445
|
+
// PDF page iterator
|
|
446
|
+
module.define_module_function("native_render_pdf_pages_iter", function!(render_pdf_pages_iter, 2))?;
|
|
447
|
+
module.define_module_function("native_render_pdf_page", function!(native_render_pdf_page, 3))?;
|
|
448
|
+
|
|
445
449
|
// Cache functions
|
|
446
450
|
module.define_module_function("clear_cache", function!(ruby_clear_cache, 0))?;
|
|
447
451
|
module.define_module_function("cache_stats", function!(ruby_cache_stats, 0))?;
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -926,14 +926,14 @@ module Kreuzberg
|
|
|
926
926
|
# )
|
|
927
927
|
#
|
|
928
928
|
class Extraction
|
|
929
|
-
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
929
|
+
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
|
|
930
930
|
:include_document_structure,
|
|
931
931
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
932
932
|
:images, :postprocessor,
|
|
933
933
|
:token_reduction, :keywords, :html_options, :pages,
|
|
934
934
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
935
935
|
:security_limits, :layout, :concurrency,
|
|
936
|
-
:cache_namespace, :cache_ttl_secs
|
|
936
|
+
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
|
|
937
937
|
|
|
938
938
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
939
939
|
alias image_extraction images
|
|
@@ -954,11 +954,11 @@ module Kreuzberg
|
|
|
954
954
|
#
|
|
955
955
|
# Keys that are allowed in the Extraction config
|
|
956
956
|
ALLOWED_KEYS = %i[
|
|
957
|
-
use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
|
|
957
|
+
use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
|
|
958
958
|
language_detection pdf_options image_extraction
|
|
959
959
|
postprocessor token_reduction keywords html_options pages
|
|
960
960
|
max_concurrent_extractions output_format result_format
|
|
961
|
-
security_limits layout concurrency cache_namespace cache_ttl_secs
|
|
961
|
+
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
962
962
|
].freeze
|
|
963
963
|
|
|
964
964
|
# Aliases for backward compatibility
|
|
@@ -1019,6 +1019,7 @@ module Kreuzberg
|
|
|
1019
1019
|
use_cache: true,
|
|
1020
1020
|
enable_quality_processing: true,
|
|
1021
1021
|
force_ocr: false,
|
|
1022
|
+
force_ocr_pages: nil,
|
|
1022
1023
|
include_document_structure: false,
|
|
1023
1024
|
ocr: nil,
|
|
1024
1025
|
chunking: nil,
|
|
@@ -1037,10 +1038,12 @@ module Kreuzberg
|
|
|
1037
1038
|
layout: nil,
|
|
1038
1039
|
concurrency: nil,
|
|
1039
1040
|
cache_namespace: nil,
|
|
1040
|
-
cache_ttl_secs: nil
|
|
1041
|
+
cache_ttl_secs: nil,
|
|
1042
|
+
extraction_timeout_secs: nil)
|
|
1041
1043
|
kwargs = {
|
|
1042
1044
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1043
|
-
force_ocr: force_ocr,
|
|
1045
|
+
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
1046
|
+
include_document_structure: include_document_structure,
|
|
1044
1047
|
ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
1045
1048
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
1046
1049
|
postprocessor: postprocessor,
|
|
@@ -1050,7 +1053,8 @@ module Kreuzberg
|
|
|
1050
1053
|
security_limits: security_limits, layout: layout,
|
|
1051
1054
|
concurrency: concurrency,
|
|
1052
1055
|
cache_namespace: cache_namespace,
|
|
1053
|
-
cache_ttl_secs: cache_ttl_secs
|
|
1056
|
+
cache_ttl_secs: cache_ttl_secs,
|
|
1057
|
+
extraction_timeout_secs: extraction_timeout_secs
|
|
1054
1058
|
}
|
|
1055
1059
|
extracted = extract_from_hash(hash, kwargs)
|
|
1056
1060
|
|
|
@@ -1068,6 +1072,7 @@ module Kreuzberg
|
|
|
1068
1072
|
@use_cache = params[:use_cache] ? true : false
|
|
1069
1073
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
1070
1074
|
@force_ocr = params[:force_ocr] ? true : false
|
|
1075
|
+
@force_ocr_pages = params[:force_ocr_pages]
|
|
1071
1076
|
@include_document_structure = params[:include_document_structure] ? true : false
|
|
1072
1077
|
@ocr = normalize_config(params[:ocr], OCR)
|
|
1073
1078
|
@chunking = normalize_config(params[:chunking], Chunking)
|
|
@@ -1086,6 +1091,7 @@ module Kreuzberg
|
|
|
1086
1091
|
@result_format = validate_result_format(params[:result_format])
|
|
1087
1092
|
@cache_namespace = params[:cache_namespace]
|
|
1088
1093
|
@cache_ttl_secs = params[:cache_ttl_secs]&.to_i
|
|
1094
|
+
@extraction_timeout_secs = params[:extraction_timeout_secs]&.to_i
|
|
1089
1095
|
@security_limits = params[:security_limits]
|
|
1090
1096
|
end
|
|
1091
1097
|
|
|
@@ -1118,12 +1124,14 @@ module Kreuzberg
|
|
|
1118
1124
|
use_cache: @use_cache,
|
|
1119
1125
|
enable_quality_processing: @enable_quality_processing,
|
|
1120
1126
|
force_ocr: @force_ocr,
|
|
1127
|
+
force_ocr_pages: @force_ocr_pages,
|
|
1121
1128
|
include_document_structure: @include_document_structure,
|
|
1122
1129
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1123
1130
|
output_format: @output_format,
|
|
1124
1131
|
result_format: @result_format,
|
|
1125
1132
|
cache_namespace: @cache_namespace,
|
|
1126
|
-
cache_ttl_secs: @cache_ttl_secs
|
|
1133
|
+
cache_ttl_secs: @cache_ttl_secs,
|
|
1134
|
+
extraction_timeout_secs: @extraction_timeout_secs
|
|
1127
1135
|
}
|
|
1128
1136
|
end
|
|
1129
1137
|
|
|
@@ -1250,6 +1258,8 @@ module Kreuzberg
|
|
|
1250
1258
|
@enable_quality_processing = value ? true : false
|
|
1251
1259
|
when :force_ocr
|
|
1252
1260
|
@force_ocr = value ? true : false
|
|
1261
|
+
when :force_ocr_pages
|
|
1262
|
+
@force_ocr_pages = value
|
|
1253
1263
|
when :include_document_structure
|
|
1254
1264
|
@include_document_structure = value ? true : false
|
|
1255
1265
|
when :ocr
|
|
@@ -1286,6 +1296,8 @@ module Kreuzberg
|
|
|
1286
1296
|
@cache_namespace = value
|
|
1287
1297
|
when :cache_ttl_secs
|
|
1288
1298
|
@cache_ttl_secs = value&.to_i
|
|
1299
|
+
when :extraction_timeout_secs
|
|
1300
|
+
@extraction_timeout_secs = value&.to_i
|
|
1289
1301
|
else
|
|
1290
1302
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1291
1303
|
end
|
|
@@ -1345,6 +1357,7 @@ module Kreuzberg
|
|
|
1345
1357
|
@use_cache = merged.use_cache
|
|
1346
1358
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1347
1359
|
@force_ocr = merged.force_ocr
|
|
1360
|
+
@force_ocr_pages = merged.force_ocr_pages
|
|
1348
1361
|
@include_document_structure = merged.include_document_structure
|
|
1349
1362
|
@ocr = merged.ocr
|
|
1350
1363
|
@chunking = merged.chunking
|
|
@@ -1369,6 +1382,7 @@ module Kreuzberg
|
|
|
1369
1382
|
@result_format = merged.result_format
|
|
1370
1383
|
@cache_namespace = merged.cache_namespace
|
|
1371
1384
|
@cache_ttl_secs = merged.cache_ttl_secs
|
|
1385
|
+
@extraction_timeout_secs = merged.extraction_timeout_secs
|
|
1372
1386
|
end
|
|
1373
1387
|
end
|
|
1374
1388
|
end
|
|
@@ -319,6 +319,43 @@ module Kreuzberg
|
|
|
319
319
|
results
|
|
320
320
|
end
|
|
321
321
|
|
|
322
|
+
# Render a single PDF page as a PNG image.
|
|
323
|
+
#
|
|
324
|
+
# @param path [String, Pathname] Path to the PDF file
|
|
325
|
+
# @param page_index [Integer] Zero-based page index
|
|
326
|
+
# @param dpi [Integer] Rendering resolution (default 150)
|
|
327
|
+
# @return [String] PNG-encoded binary string
|
|
328
|
+
# @raise [Errors::IOError] If the file cannot be read
|
|
329
|
+
# @raise [Errors::ParsingError] If rendering fails
|
|
330
|
+
def render_pdf_page(path, page_index, dpi: 150)
|
|
331
|
+
path_str = path.to_s
|
|
332
|
+
raise ArgumentError, 'page_index must be non-negative' if page_index.negative?
|
|
333
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
334
|
+
|
|
335
|
+
native_render_pdf_page(path_str, page_index, dpi)
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
# Iterate over pages of a PDF lazily, yielding each page as it is rendered.
|
|
339
|
+
#
|
|
340
|
+
# Each page is rendered via the native FFI iterator, so only one page is in
|
|
341
|
+
# memory at a time.
|
|
342
|
+
#
|
|
343
|
+
# @param path [String, Pathname] Path to the PDF file
|
|
344
|
+
# @param dpi [Integer] Rendering resolution (default 150)
|
|
345
|
+
# @yieldparam page_index [Integer] Zero-based page index
|
|
346
|
+
# @yieldparam png_bytes [String] PNG-encoded binary string for the page
|
|
347
|
+
# @return [Enumerator] if no block is given
|
|
348
|
+
# @raise [Errors::IOError] If the file cannot be read
|
|
349
|
+
# @raise [Errors::ParsingError] If rendering fails
|
|
350
|
+
def render_pdf_pages_iter(path, dpi: 150, &block)
|
|
351
|
+
path_str = path.to_s
|
|
352
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
353
|
+
|
|
354
|
+
return enum_for(:render_pdf_pages_iter, path, dpi: dpi) unless block
|
|
355
|
+
|
|
356
|
+
native_render_pdf_pages_iter(path_str, dpi, &block)
|
|
357
|
+
end
|
|
358
|
+
|
|
322
359
|
def normalize_config(config)
|
|
323
360
|
return {} if config.nil?
|
|
324
361
|
return config if config.is_a?(Hash)
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -481,7 +481,9 @@ module Kreuzberg
|
|
|
481
481
|
attr_reader enable_quality_processing: bool
|
|
482
482
|
attr_reader cache_namespace: String?
|
|
483
483
|
attr_reader cache_ttl_secs: Integer?
|
|
484
|
+
attr_reader extraction_timeout_secs: Integer?
|
|
484
485
|
attr_reader force_ocr: bool
|
|
486
|
+
attr_reader force_ocr_pages: Array[Integer]?
|
|
485
487
|
attr_reader include_document_structure: bool
|
|
486
488
|
attr_reader ocr: OCR?
|
|
487
489
|
attr_reader chunking: Chunking?
|
|
@@ -508,6 +510,7 @@ module Kreuzberg
|
|
|
508
510
|
?use_cache: bool,
|
|
509
511
|
?enable_quality_processing: bool,
|
|
510
512
|
?force_ocr: bool,
|
|
513
|
+
?force_ocr_pages: Array[Integer]?,
|
|
511
514
|
?include_document_structure: bool,
|
|
512
515
|
?ocr: (OCR | Hash[Symbol, untyped])?,
|
|
513
516
|
?chunking: (Chunking | Hash[Symbol, untyped])?,
|
|
@@ -525,7 +528,8 @@ module Kreuzberg
|
|
|
525
528
|
?output_format: String?,
|
|
526
529
|
?result_format: String?,
|
|
527
530
|
?cache_namespace: String?,
|
|
528
|
-
?cache_ttl_secs: Integer
|
|
531
|
+
?cache_ttl_secs: Integer?,
|
|
532
|
+
?extraction_timeout_secs: Integer?
|
|
529
533
|
) -> void
|
|
530
534
|
def to_h: () -> Hash[Symbol, untyped]
|
|
531
535
|
def to_json: (*untyped) -> String
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Hand-written binding-specific edge case tests for PDF rendering.
|
|
2
|
+
# Happy-path render tests are auto-generated from fixtures in e2e/.
|
|
3
|
+
# These tests cover error handling, validation, and lifecycle patterns
|
|
4
|
+
# that vary per language and can't be generated uniformly.
|
|
5
|
+
|
|
6
|
+
# frozen_string_literal: true
|
|
7
|
+
|
|
8
|
+
require 'spec_helper'
|
|
9
|
+
|
|
10
|
+
RSpec.describe 'PDF Rendering' do
|
|
11
|
+
it 'exposes rendering methods' do
|
|
12
|
+
expect(Kreuzberg).to respond_to(:render_pdf_page)
|
|
13
|
+
expect(Kreuzberg).to respond_to(:render_pdf_pages_iter)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
describe '.render_pdf_page' do
|
|
17
|
+
it 'raises an error for a nonexistent file' do
|
|
18
|
+
expect do
|
|
19
|
+
Kreuzberg.render_pdf_page('/nonexistent/path/to/document.pdf', 0)
|
|
20
|
+
end.to raise_error(Kreuzberg::Errors::IOError)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
it 'raises an error for an out-of-bounds page index' do
|
|
24
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
25
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
26
|
+
|
|
27
|
+
expect do
|
|
28
|
+
Kreuzberg.render_pdf_page(pdf_path, 9999)
|
|
29
|
+
end.to raise_error(StandardError)
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
describe '.render_pdf_page with negative index' do
|
|
34
|
+
it 'raises ArgumentError for a negative page index' do
|
|
35
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
36
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
37
|
+
|
|
38
|
+
expect do
|
|
39
|
+
Kreuzberg.render_pdf_page(pdf_path, -1)
|
|
40
|
+
end.to raise_error(ArgumentError)
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
describe '.render_pdf_pages_iter' do
|
|
45
|
+
it 'raises an error for a nonexistent file' do
|
|
46
|
+
expect do
|
|
47
|
+
Kreuzberg.render_pdf_pages_iter('/nonexistent/path/to/document.pdf') { |_, _| nil }
|
|
48
|
+
end.to raise_error(Kreuzberg::Errors::IOError)
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
describe '.render_pdf_page with empty path' do
|
|
53
|
+
it 'raises an error for an empty path' do
|
|
54
|
+
expect do
|
|
55
|
+
Kreuzberg.render_pdf_page('', 0)
|
|
56
|
+
end.to raise_error(StandardError)
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
describe '.render_pdf_pages_iter cleanup' do
|
|
61
|
+
it 'handles iterator cleanup without fully consuming' do
|
|
62
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
63
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
64
|
+
|
|
65
|
+
# Iterate but stop immediately — no crash
|
|
66
|
+
Kreuzberg.render_pdf_pages_iter(pdf_path) do |_page_index, _png_data|
|
|
67
|
+
break
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
describe '.render_pdf_pages_iter early termination' do
|
|
73
|
+
it 'returns valid PNG for the first page then stops' do
|
|
74
|
+
pdf_path = test_document_path('pdf/tiny.pdf')
|
|
75
|
+
skip 'Test PDF not available' unless File.exist?(pdf_path)
|
|
76
|
+
|
|
77
|
+
first_png = nil
|
|
78
|
+
Kreuzberg.render_pdf_pages_iter(pdf_path) do |page_index, png_data|
|
|
79
|
+
expect(page_index).to eq(0)
|
|
80
|
+
expect(png_data).to be_a(String)
|
|
81
|
+
expect(png_data.bytesize).to be > 8
|
|
82
|
+
# PNG magic bytes
|
|
83
|
+
expect(png_data.bytes[0..3]).to eq([0x89, 0x50, 0x4E, 0x47])
|
|
84
|
+
first_png = png_data
|
|
85
|
+
break
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
expect(first_png).not_to be_nil
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.6.0"
|
|
5
|
+
version = "4.6.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
|
30
30
|
image = { version = "0.25.10", default-features = false }
|
|
31
31
|
itertools = "0.14"
|
|
32
32
|
js-sys = "0.3"
|
|
33
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.0", default-features = false }
|
|
34
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.0" }
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.2", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.2" }
|
|
35
35
|
lazy_static = "1.5.0"
|
|
36
36
|
libc = "0.2.183"
|
|
37
37
|
log = "0.4"
|