kreuzberg 4.8.4 → 4.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -26
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
- data/ext/kreuzberg_rb/native/src/result.rs +39 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +85 -15
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +1 -1
- data/vendor/kreuzberg/src/chunking/core.rs +114 -9
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +4 -1
- data/vendor/kreuzberg/src/embeddings/mod.rs +4 -1
- data/vendor/kreuzberg/src/llm/mod.rs +2 -0
- data/vendor/kreuzberg/src/llm/structured.rs +7 -3
- data/vendor/kreuzberg/src/llm/usage.rs +40 -0
- data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +5 -3
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +6 -3
- data/vendor/kreuzberg/src/mcp/server.rs +1 -1
- data/vendor/kreuzberg/src/types/extraction.rs +41 -0
- data/vendor/kreuzberg/tests/llm_integration.rs +7 -7
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +63 -13
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +6 -3
- data/vendor/kreuzberg-ffi/src/memory.rs +30 -11
- data/vendor/kreuzberg-ffi/src/result.rs +71 -0
- data/vendor/kreuzberg-ffi/src/types.rs +19 -16
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 198e3a74bbf86420f51a7d5b8f80e5e110cc93f2e144fadfe129f8f90155fe40
|
|
4
|
+
data.tar.gz: e54cfaf29ae9b910f5cc365a8095298e312ef93098c185dafa497b80a5f5859b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e5d6e2d8a21b12d6984bd4fc3d4cdc7858b46d1d21e09e238c832da830f8329af93f33ac6997590a342b52f68c7448aa68d68b8a323397116682e1373be28d38
|
|
7
|
+
data.tar.gz: d5a7c6df3f4a90f76e8554fffcb3cba45e274521f2efc96ecf4c882228b8398ed4ec0cb3322ccf1090b615b3ecae5b19b8f947e779fb3bf1a72c6e101d4184f1
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.4" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.5" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -39,10 +39,13 @@
|
|
|
39
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
40
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
41
41
|
</a>
|
|
42
|
+
<a href="https://artifacthub.io/packages/search?repo=kreuzberg">
|
|
43
|
+
<img src="https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/kreuzberg" alt="Artifact Hub">
|
|
44
|
+
</a>
|
|
42
45
|
|
|
43
46
|
<!-- Project Info -->
|
|
44
47
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-
|
|
48
|
+
<img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
|
|
46
49
|
</a>
|
|
47
50
|
<a href="https://docs.kreuzberg.dev">
|
|
48
51
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
@@ -419,7 +422,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
|
|
|
419
422
|
|
|
420
423
|
## License
|
|
421
424
|
|
|
422
|
-
|
|
425
|
+
MIT License - see LICENSE file for details.
|
|
423
426
|
|
|
424
427
|
## Support
|
|
425
428
|
|
|
@@ -326,9 +326,9 @@ dependencies = [
|
|
|
326
326
|
|
|
327
327
|
[[package]]
|
|
328
328
|
name = "axum"
|
|
329
|
-
version = "0.8.
|
|
329
|
+
version = "0.8.9"
|
|
330
330
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
331
|
-
checksum = "
|
|
331
|
+
checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90"
|
|
332
332
|
dependencies = [
|
|
333
333
|
"axum-core",
|
|
334
334
|
"axum-macros",
|
|
@@ -380,9 +380,9 @@ dependencies = [
|
|
|
380
380
|
|
|
381
381
|
[[package]]
|
|
382
382
|
name = "axum-macros"
|
|
383
|
-
version = "0.5.
|
|
383
|
+
version = "0.5.1"
|
|
384
384
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
385
|
-
checksum = "
|
|
385
|
+
checksum = "7aa268c23bfbbd2c4363b9cd302a4f504fb2a9dfe7e3451d66f35dd392e20aca"
|
|
386
386
|
dependencies = [
|
|
387
387
|
"proc-macro2",
|
|
388
388
|
"quote",
|
|
@@ -766,7 +766,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
|
|
|
766
766
|
dependencies = [
|
|
767
767
|
"cfg-if",
|
|
768
768
|
"cpufeatures 0.3.0",
|
|
769
|
-
"rand_core 0.10.
|
|
769
|
+
"rand_core 0.10.1",
|
|
770
770
|
]
|
|
771
771
|
|
|
772
772
|
[[package]]
|
|
@@ -1858,7 +1858,7 @@ dependencies = [
|
|
|
1858
1858
|
"js-sys",
|
|
1859
1859
|
"libc",
|
|
1860
1860
|
"r-efi 6.0.0",
|
|
1861
|
-
"rand_core 0.10.
|
|
1861
|
+
"rand_core 0.10.1",
|
|
1862
1862
|
"wasip2",
|
|
1863
1863
|
"wasip3",
|
|
1864
1864
|
"wasm-bindgen",
|
|
@@ -2098,7 +2098,7 @@ dependencies = [
|
|
|
2098
2098
|
"indicatif 0.17.11",
|
|
2099
2099
|
"libc",
|
|
2100
2100
|
"log",
|
|
2101
|
-
"rand 0.9.
|
|
2101
|
+
"rand 0.9.4",
|
|
2102
2102
|
"serde",
|
|
2103
2103
|
"serde_json",
|
|
2104
2104
|
"thiserror 2.0.18",
|
|
@@ -2117,7 +2117,7 @@ dependencies = [
|
|
|
2117
2117
|
"indicatif 0.18.4",
|
|
2118
2118
|
"libc",
|
|
2119
2119
|
"log",
|
|
2120
|
-
"rand 0.9.
|
|
2120
|
+
"rand 0.9.4",
|
|
2121
2121
|
"serde",
|
|
2122
2122
|
"serde_json",
|
|
2123
2123
|
"thiserror 2.0.18",
|
|
@@ -2529,7 +2529,7 @@ dependencies = [
|
|
|
2529
2529
|
"itertools 0.14.0",
|
|
2530
2530
|
"nalgebra",
|
|
2531
2531
|
"num",
|
|
2532
|
-
"rand 0.9.
|
|
2532
|
+
"rand 0.9.4",
|
|
2533
2533
|
"rand_distr",
|
|
2534
2534
|
]
|
|
2535
2535
|
|
|
@@ -2931,7 +2931,7 @@ dependencies = [
|
|
|
2931
2931
|
|
|
2932
2932
|
[[package]]
|
|
2933
2933
|
name = "kreuzberg-rb"
|
|
2934
|
-
version = "4.8.4"
|
|
2934
|
+
version = "4.8.5"
|
|
2935
2935
|
dependencies = [
|
|
2936
2936
|
"async-trait",
|
|
2937
2937
|
"html-to-markdown-rs",
|
|
@@ -3138,9 +3138,9 @@ dependencies = [
|
|
|
3138
3138
|
|
|
3139
3139
|
[[package]]
|
|
3140
3140
|
name = "lru"
|
|
3141
|
-
version = "0.16.
|
|
3141
|
+
version = "0.16.4"
|
|
3142
3142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3143
|
-
checksum = "
|
|
3143
|
+
checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
|
|
3144
3144
|
dependencies = [
|
|
3145
3145
|
"hashbrown 0.16.1",
|
|
3146
3146
|
]
|
|
@@ -3707,7 +3707,7 @@ dependencies = [
|
|
|
3707
3707
|
"futures-util",
|
|
3708
3708
|
"opentelemetry",
|
|
3709
3709
|
"percent-encoding",
|
|
3710
|
-
"rand 0.9.
|
|
3710
|
+
"rand 0.9.4",
|
|
3711
3711
|
"thiserror 2.0.18",
|
|
3712
3712
|
"tokio",
|
|
3713
3713
|
"tokio-stream",
|
|
@@ -4074,7 +4074,7 @@ dependencies = [
|
|
|
4074
4074
|
"bytes",
|
|
4075
4075
|
"getrandom 0.3.4",
|
|
4076
4076
|
"lru-slab",
|
|
4077
|
-
"rand 0.9.
|
|
4077
|
+
"rand 0.9.4",
|
|
4078
4078
|
"ring",
|
|
4079
4079
|
"rustc-hash",
|
|
4080
4080
|
"rustls",
|
|
@@ -4140,9 +4140,9 @@ dependencies = [
|
|
|
4140
4140
|
|
|
4141
4141
|
[[package]]
|
|
4142
4142
|
name = "rand"
|
|
4143
|
-
version = "0.9.
|
|
4143
|
+
version = "0.9.4"
|
|
4144
4144
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4145
|
-
checksum = "
|
|
4145
|
+
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
|
4146
4146
|
dependencies = [
|
|
4147
4147
|
"rand_chacha",
|
|
4148
4148
|
"rand_core 0.9.5",
|
|
@@ -4156,7 +4156,7 @@ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
|
|
|
4156
4156
|
dependencies = [
|
|
4157
4157
|
"chacha20",
|
|
4158
4158
|
"getrandom 0.4.2",
|
|
4159
|
-
"rand_core 0.10.
|
|
4159
|
+
"rand_core 0.10.1",
|
|
4160
4160
|
]
|
|
4161
4161
|
|
|
4162
4162
|
[[package]]
|
|
@@ -4180,9 +4180,9 @@ dependencies = [
|
|
|
4180
4180
|
|
|
4181
4181
|
[[package]]
|
|
4182
4182
|
name = "rand_core"
|
|
4183
|
-
version = "0.10.
|
|
4183
|
+
version = "0.10.1"
|
|
4184
4184
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4185
|
-
checksum = "
|
|
4185
|
+
checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
|
|
4186
4186
|
|
|
4187
4187
|
[[package]]
|
|
4188
4188
|
name = "rand_distr"
|
|
@@ -4191,7 +4191,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
4191
4191
|
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
|
4192
4192
|
dependencies = [
|
|
4193
4193
|
"num-traits",
|
|
4194
|
-
"rand 0.9.
|
|
4194
|
+
"rand 0.9.4",
|
|
4195
4195
|
]
|
|
4196
4196
|
|
|
4197
4197
|
[[package]]
|
|
@@ -4227,7 +4227,7 @@ dependencies = [
|
|
|
4227
4227
|
"num-traits",
|
|
4228
4228
|
"paste",
|
|
4229
4229
|
"profiling",
|
|
4230
|
-
"rand 0.9.
|
|
4230
|
+
"rand 0.9.4",
|
|
4231
4231
|
"rand_chacha",
|
|
4232
4232
|
"simd_helpers",
|
|
4233
4233
|
"thiserror 2.0.18",
|
|
@@ -4258,9 +4258,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
|
|
|
4258
4258
|
|
|
4259
4259
|
[[package]]
|
|
4260
4260
|
name = "rayon"
|
|
4261
|
-
version = "1.
|
|
4261
|
+
version = "1.12.0"
|
|
4262
4262
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4263
|
-
checksum = "
|
|
4263
|
+
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
|
4264
4264
|
dependencies = [
|
|
4265
4265
|
"either",
|
|
4266
4266
|
"rayon-core",
|
|
@@ -4625,9 +4625,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
|
|
|
4625
4625
|
|
|
4626
4626
|
[[package]]
|
|
4627
4627
|
name = "rustls-webpki"
|
|
4628
|
-
version = "0.103.
|
|
4628
|
+
version = "0.103.12"
|
|
4629
4629
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4630
|
-
checksum = "
|
|
4630
|
+
checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06"
|
|
4631
4631
|
dependencies = [
|
|
4632
4632
|
"aws-lc-rs",
|
|
4633
4633
|
"ring",
|
|
@@ -5378,7 +5378,7 @@ dependencies = [
|
|
|
5378
5378
|
"monostate",
|
|
5379
5379
|
"onig",
|
|
5380
5380
|
"paste",
|
|
5381
|
-
"rand 0.9.
|
|
5381
|
+
"rand 0.9.4",
|
|
5382
5382
|
"rayon",
|
|
5383
5383
|
"rayon-cond",
|
|
5384
5384
|
"regex",
|
|
@@ -712,6 +712,45 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
|
|
|
712
712
|
}
|
|
713
713
|
set_hash_entry(ruby, &hash, "processing_warnings", warnings_array.into_value_with(ruby))?;
|
|
714
714
|
|
|
715
|
+
// Convert LLM usage
|
|
716
|
+
if let Some(usages) = result.llm_usage {
|
|
717
|
+
let usage_array = ruby.ary_new();
|
|
718
|
+
for usage in usages {
|
|
719
|
+
let usage_hash = ruby.hash_new();
|
|
720
|
+
usage_hash.aset("model", usage.model.as_str())?;
|
|
721
|
+
usage_hash.aset("source", usage.source.as_str())?;
|
|
722
|
+
if let Some(input_tokens) = usage.input_tokens {
|
|
723
|
+
usage_hash.aset("input_tokens", input_tokens as i64)?;
|
|
724
|
+
} else {
|
|
725
|
+
usage_hash.aset("input_tokens", ruby.qnil().as_value())?;
|
|
726
|
+
}
|
|
727
|
+
if let Some(output_tokens) = usage.output_tokens {
|
|
728
|
+
usage_hash.aset("output_tokens", output_tokens as i64)?;
|
|
729
|
+
} else {
|
|
730
|
+
usage_hash.aset("output_tokens", ruby.qnil().as_value())?;
|
|
731
|
+
}
|
|
732
|
+
if let Some(total_tokens) = usage.total_tokens {
|
|
733
|
+
usage_hash.aset("total_tokens", total_tokens as i64)?;
|
|
734
|
+
} else {
|
|
735
|
+
usage_hash.aset("total_tokens", ruby.qnil().as_value())?;
|
|
736
|
+
}
|
|
737
|
+
if let Some(cost) = usage.estimated_cost {
|
|
738
|
+
usage_hash.aset("estimated_cost", ruby.float_from_f64(cost).into_value_with(ruby))?;
|
|
739
|
+
} else {
|
|
740
|
+
usage_hash.aset("estimated_cost", ruby.qnil().as_value())?;
|
|
741
|
+
}
|
|
742
|
+
if let Some(reason) = usage.finish_reason {
|
|
743
|
+
usage_hash.aset("finish_reason", reason.as_str())?;
|
|
744
|
+
} else {
|
|
745
|
+
usage_hash.aset("finish_reason", ruby.qnil().as_value())?;
|
|
746
|
+
}
|
|
747
|
+
usage_array.push(usage_hash)?;
|
|
748
|
+
}
|
|
749
|
+
set_hash_entry(ruby, &hash, "llm_usage", usage_array.into_value_with(ruby))?;
|
|
750
|
+
} else {
|
|
751
|
+
set_hash_entry(ruby, &hash, "llm_usage", ruby.qnil().as_value())?;
|
|
752
|
+
}
|
|
753
|
+
|
|
715
754
|
// Convert annotations
|
|
716
755
|
if let Some(annotations) = result.annotations {
|
|
717
756
|
let annotations_array = ruby.ary_new();
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -18,7 +18,8 @@ module Kreuzberg
|
|
|
18
18
|
type element_type = 'title' | 'narrative_text' | 'heading' | 'list_item' | 'table' | 'image' | 'page_break' | 'code_block' | 'block_quote' | 'footer' | 'header'
|
|
19
19
|
|
|
20
20
|
# Bounding box coordinates for element positioning (T::Struct from types.rb)
|
|
21
|
-
class BoundingBox
|
|
21
|
+
class BoundingBox
|
|
22
|
+
attr_reader x0: Float
|
|
22
23
|
attr_reader y0: Float
|
|
23
24
|
attr_reader x1: Float
|
|
24
25
|
attr_reader y1: Float
|
|
@@ -28,7 +29,8 @@ module Kreuzberg
|
|
|
28
29
|
end
|
|
29
30
|
|
|
30
31
|
# Metadata for a semantic element (T::Struct from types.rb)
|
|
31
|
-
class ElementMetadata
|
|
32
|
+
class ElementMetadata
|
|
33
|
+
attr_reader page_number: Integer?
|
|
32
34
|
attr_reader filename: String?
|
|
33
35
|
attr_reader coordinates: BoundingBox?
|
|
34
36
|
attr_reader element_index: Integer?
|
|
@@ -39,7 +41,8 @@ module Kreuzberg
|
|
|
39
41
|
end
|
|
40
42
|
|
|
41
43
|
# Semantic element extracted from document (T::Struct from types.rb)
|
|
42
|
-
class Element
|
|
44
|
+
class Element
|
|
45
|
+
attr_reader element_id: String
|
|
43
46
|
attr_reader element_type: String
|
|
44
47
|
attr_reader text: String
|
|
45
48
|
attr_reader metadata: ElementMetadata
|
|
@@ -49,7 +52,8 @@ module Kreuzberg
|
|
|
49
52
|
end
|
|
50
53
|
|
|
51
54
|
# Header/Heading metadata (T::Struct from types.rb)
|
|
52
|
-
class HeaderMetadata
|
|
55
|
+
class HeaderMetadata
|
|
56
|
+
attr_reader level: Integer
|
|
53
57
|
attr_reader text: String
|
|
54
58
|
attr_reader id: String?
|
|
55
59
|
attr_reader depth: Integer
|
|
@@ -60,7 +64,8 @@ module Kreuzberg
|
|
|
60
64
|
end
|
|
61
65
|
|
|
62
66
|
# Link metadata (T::Struct from types.rb)
|
|
63
|
-
class LinkMetadata
|
|
67
|
+
class LinkMetadata
|
|
68
|
+
attr_reader href: String
|
|
64
69
|
attr_reader text: String
|
|
65
70
|
attr_reader title: String?
|
|
66
71
|
attr_reader link_type: String
|
|
@@ -72,7 +77,8 @@ module Kreuzberg
|
|
|
72
77
|
end
|
|
73
78
|
|
|
74
79
|
# Image metadata (T::Struct from types.rb)
|
|
75
|
-
class ImageMetadata
|
|
80
|
+
class ImageMetadata
|
|
81
|
+
attr_reader src: String
|
|
76
82
|
attr_reader alt: String?
|
|
77
83
|
attr_reader title: String?
|
|
78
84
|
attr_reader dimensions: Array[Integer]?
|
|
@@ -84,7 +90,8 @@ module Kreuzberg
|
|
|
84
90
|
end
|
|
85
91
|
|
|
86
92
|
# Structured data metadata (T::Struct from types.rb)
|
|
87
|
-
class StructuredData
|
|
93
|
+
class StructuredData
|
|
94
|
+
attr_reader data_type: String
|
|
88
95
|
attr_reader raw_json: String
|
|
89
96
|
attr_reader schema_type: String?
|
|
90
97
|
|
|
@@ -210,7 +217,8 @@ module Kreuzberg
|
|
|
210
217
|
end
|
|
211
218
|
|
|
212
219
|
# HTML metadata (T::Struct from types.rb)
|
|
213
|
-
class HtmlMetadata
|
|
220
|
+
class HtmlMetadata
|
|
221
|
+
attr_reader title: String?
|
|
214
222
|
attr_reader description: String?
|
|
215
223
|
attr_reader author: String?
|
|
216
224
|
attr_reader copyright: String?
|
|
@@ -261,7 +269,8 @@ module Kreuzberg
|
|
|
261
269
|
end
|
|
262
270
|
|
|
263
271
|
# Extracted keyword with relevance metadata (T::Struct from types.rb)
|
|
264
|
-
class ExtractedKeyword
|
|
272
|
+
class ExtractedKeyword
|
|
273
|
+
attr_reader text: String
|
|
265
274
|
attr_reader score: Float
|
|
266
275
|
attr_reader algorithm: String
|
|
267
276
|
attr_reader positions: Array[Integer]?
|
|
@@ -271,15 +280,39 @@ module Kreuzberg
|
|
|
271
280
|
end
|
|
272
281
|
|
|
273
282
|
# Processing warning from a pipeline stage (T::Struct from types.rb)
|
|
274
|
-
class ProcessingWarning
|
|
283
|
+
class ProcessingWarning
|
|
284
|
+
attr_reader source: String
|
|
275
285
|
attr_reader message: String
|
|
276
286
|
|
|
277
287
|
def initialize: (source: String, message: String) -> void
|
|
278
288
|
def serialize: () -> Hash[Symbol, untyped]
|
|
279
289
|
end
|
|
280
290
|
|
|
291
|
+
# LLM token usage from an LLM-assisted extraction step (T::Struct from types.rb)
|
|
292
|
+
class LlmUsage
|
|
293
|
+
attr_reader model: String
|
|
294
|
+
attr_reader source: String
|
|
295
|
+
attr_reader input_tokens: Integer?
|
|
296
|
+
attr_reader output_tokens: Integer?
|
|
297
|
+
attr_reader total_tokens: Integer?
|
|
298
|
+
attr_reader estimated_cost: Float?
|
|
299
|
+
attr_reader finish_reason: String?
|
|
300
|
+
|
|
301
|
+
def initialize: (
|
|
302
|
+
model: String,
|
|
303
|
+
source: String,
|
|
304
|
+
?input_tokens: Integer?,
|
|
305
|
+
?output_tokens: Integer?,
|
|
306
|
+
?total_tokens: Integer?,
|
|
307
|
+
?estimated_cost: Float?,
|
|
308
|
+
?finish_reason: String?
|
|
309
|
+
) -> void
|
|
310
|
+
def serialize: () -> Hash[Symbol, untyped]
|
|
311
|
+
end
|
|
312
|
+
|
|
281
313
|
# Bounding box for document node positioning (T::Struct from types.rb)
|
|
282
|
-
class DocumentBoundingBox
|
|
314
|
+
class DocumentBoundingBox
|
|
315
|
+
attr_reader x0: Float
|
|
283
316
|
attr_reader y0: Float
|
|
284
317
|
attr_reader x1: Float
|
|
285
318
|
attr_reader y1: Float
|
|
@@ -289,7 +322,8 @@ module Kreuzberg
|
|
|
289
322
|
end
|
|
290
323
|
|
|
291
324
|
# Annotation for a document node (T::Struct from types.rb)
|
|
292
|
-
class DocumentAnnotation
|
|
325
|
+
class DocumentAnnotation
|
|
326
|
+
attr_reader key: String
|
|
293
327
|
attr_reader value: String
|
|
294
328
|
|
|
295
329
|
def initialize: (key: String, value: String) -> void
|
|
@@ -297,7 +331,8 @@ module Kreuzberg
|
|
|
297
331
|
end
|
|
298
332
|
|
|
299
333
|
# Single node in the document structure tree (T::Struct from types.rb)
|
|
300
|
-
class DocumentNode
|
|
334
|
+
class DocumentNode
|
|
335
|
+
attr_reader id: String
|
|
301
336
|
attr_reader content: String
|
|
302
337
|
attr_reader parent: Integer?
|
|
303
338
|
attr_reader children: Array[Integer]
|
|
@@ -322,7 +357,8 @@ module Kreuzberg
|
|
|
322
357
|
end
|
|
323
358
|
|
|
324
359
|
# Structured document representation (T::Struct from types.rb)
|
|
325
|
-
class DocumentStructure
|
|
360
|
+
class DocumentStructure
|
|
361
|
+
attr_reader nodes: Array[DocumentNode]
|
|
326
362
|
|
|
327
363
|
def initialize: (nodes: Array[DocumentNode]) -> void
|
|
328
364
|
def serialize: () -> Hash[Symbol, untyped]
|
|
@@ -927,7 +963,10 @@ module Kreuzberg
|
|
|
927
963
|
extracted_keywords: Array[extracted_keyword_hash]?,
|
|
928
964
|
quality_score: Float?,
|
|
929
965
|
processing_warnings: Array[processing_warning_hash]?,
|
|
930
|
-
annotations: Array[pdf_annotation_hash]
|
|
966
|
+
annotations: Array[pdf_annotation_hash]?,
|
|
967
|
+
uris: Array[uri_hash]?,
|
|
968
|
+
children: Array[archive_entry_hash]?,
|
|
969
|
+
llm_usage: Array[llm_usage_hash]?
|
|
931
970
|
}
|
|
932
971
|
|
|
933
972
|
type extracted_keyword_hash = {
|
|
@@ -942,6 +981,29 @@ module Kreuzberg
|
|
|
942
981
|
message: String
|
|
943
982
|
}
|
|
944
983
|
|
|
984
|
+
type llm_usage_hash = {
|
|
985
|
+
model: String,
|
|
986
|
+
source: String,
|
|
987
|
+
input_tokens: Integer?,
|
|
988
|
+
output_tokens: Integer?,
|
|
989
|
+
total_tokens: Integer?,
|
|
990
|
+
estimated_cost: Float?,
|
|
991
|
+
finish_reason: String?
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
type uri_hash = {
|
|
995
|
+
url: String,
|
|
996
|
+
label: String?,
|
|
997
|
+
page: Integer?,
|
|
998
|
+
kind: String
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
type archive_entry_hash = {
|
|
1002
|
+
path: String,
|
|
1003
|
+
mime_type: String,
|
|
1004
|
+
result: extraction_result_hash?
|
|
1005
|
+
}
|
|
1006
|
+
|
|
945
1007
|
type page_content_hash = {
|
|
946
1008
|
page_number: Integer,
|
|
947
1009
|
content: String,
|
|
@@ -1483,6 +1545,9 @@ module Kreuzberg
|
|
|
1483
1545
|
attr_reader quality_score: Float?
|
|
1484
1546
|
attr_reader processing_warnings: Array[ProcessingWarning]?
|
|
1485
1547
|
attr_reader annotations: Array[PdfAnnotation]?
|
|
1548
|
+
attr_reader uris: Array[uri_hash]?
|
|
1549
|
+
attr_reader children: Array[archive_entry_hash]?
|
|
1550
|
+
attr_reader llm_usage: Array[LlmUsage]?
|
|
1486
1551
|
|
|
1487
1552
|
# PDF annotation extracted from a document page (Struct from result.rb)
|
|
1488
1553
|
class PdfAnnotation
|
|
@@ -1521,6 +1586,11 @@ module Kreuzberg
|
|
|
1521
1586
|
def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
|
|
1522
1587
|
def parse_extracted_keywords: (Array[extracted_keyword_hash]? keywords_data) -> Array[ExtractedKeyword]?
|
|
1523
1588
|
def parse_processing_warnings: (Array[processing_warning_hash]? warnings_data) -> Array[ProcessingWarning]
|
|
1589
|
+
def parse_uris: (Array[uri_hash]? uris_data) -> Array[uri_hash]?
|
|
1590
|
+
def build_uri: (Hash[String, untyped] u_hash) -> uri_hash
|
|
1591
|
+
def parse_children: (Array[untyped]? children_data) -> Array[archive_entry_hash]?
|
|
1592
|
+
def build_archive_entry: (Hash[String, untyped] c_hash) -> archive_entry_hash
|
|
1593
|
+
def parse_llm_usage: (Array[llm_usage_hash]? usage_data) -> Array[LlmUsage]?
|
|
1524
1594
|
def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped
|
|
1525
1595
|
def serialize_tables: () -> Array[table_hash]
|
|
1526
1596
|
def serialize_chunks: () -> Array[chunk_hash]?
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.8.4"
|
|
5
|
+
version = "4.8.5"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.4", default-features = false }
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.4" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.5", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.5" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -47,7 +47,7 @@ ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features
|
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
48
|
pdf_oxide = { version = "0.3.30", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
|
-
rayon = "1.
|
|
50
|
+
rayon = "1.12.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
|
52
52
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
53
53
|
serde_json = { version = "1.0.149" }
|
|
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
|
|
|
57
57
|
tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
58
|
toml = "1.1.2"
|
|
59
59
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.
|
|
60
|
+
tree-sitter-language-pack = { version = "1.6.0", features = ["serde"], default-features = false }
|
|
61
61
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
62
|
wasm-bindgen-futures = "0.4"
|
|
63
63
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.8.4"
|
|
3
|
+
version = "4.8.5"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -319,7 +319,7 @@ pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfi
|
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
321
321
|
rake = { version = "0.3.6", optional = true }
|
|
322
|
-
rayon = "1.
|
|
322
|
+
rayon = "1.12.0"
|
|
323
323
|
regex = "1.12.3"
|
|
324
324
|
rmcp = { version = "1.4.0", features = [
|
|
325
325
|
"server",
|
|
@@ -392,7 +392,7 @@ optional = true
|
|
|
392
392
|
# Override getrandom to enable js feature for WASM targets
|
|
393
393
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
394
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.
|
|
395
|
+
tree-sitter-language-pack = { version = "1.6.0", features = ["serde"], default-features = false, optional = true }
|
|
396
396
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
397
|
|
|
398
398
|
[build-dependencies]
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.8.4 Release**
|
|
21
|
+
> **🚀 Version 4.8.5 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -680,7 +680,7 @@ pub async fn extract_structured_handler(
|
|
|
680
680
|
};
|
|
681
681
|
|
|
682
682
|
// Run structured extraction on the extracted content
|
|
683
|
-
let structured_output = crate::llm::structured::extract_structured(&result.content, &structured_config)
|
|
683
|
+
let (structured_output, _usage) = crate::llm::structured::extract_structured(&result.content, &structured_config)
|
|
684
684
|
.await
|
|
685
685
|
.map_err(ApiError::internal)?;
|
|
686
686
|
|