kreuzberg 4.8.4 → 4.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/README.md +6 -3
  3. data/ext/kreuzberg_rb/native/Cargo.lock +26 -26
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
  6. data/ext/kreuzberg_rb/native/src/result.rs +39 -0
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +85 -15
  9. data/vendor/Cargo.toml +5 -5
  10. data/vendor/kreuzberg/Cargo.toml +3 -3
  11. data/vendor/kreuzberg/README.md +1 -1
  12. data/vendor/kreuzberg/src/api/handlers.rs +1 -1
  13. data/vendor/kreuzberg/src/chunking/core.rs +114 -9
  14. data/vendor/kreuzberg/src/core/pipeline/mod.rs +4 -1
  15. data/vendor/kreuzberg/src/embeddings/mod.rs +4 -1
  16. data/vendor/kreuzberg/src/llm/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/llm/structured.rs +7 -3
  18. data/vendor/kreuzberg/src/llm/usage.rs +40 -0
  19. data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +5 -3
  20. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +6 -3
  21. data/vendor/kreuzberg/src/mcp/server.rs +1 -1
  22. data/vendor/kreuzberg/src/types/extraction.rs +41 -0
  23. data/vendor/kreuzberg/tests/llm_integration.rs +7 -7
  24. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  25. data/vendor/kreuzberg-ffi/kreuzberg.h +63 -13
  26. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -0
  27. data/vendor/kreuzberg-ffi/src/lib.rs +6 -3
  28. data/vendor/kreuzberg-ffi/src/memory.rs +30 -11
  29. data/vendor/kreuzberg-ffi/src/result.rs +71 -0
  30. data/vendor/kreuzberg-ffi/src/types.rs +19 -16
  31. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  32. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  33. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  34. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 16deeaa47cb35ded0b844af72d43b74da5539084f21a79d17513be2da9ac2f0b
4
- data.tar.gz: 64715b14cffac78a796853e9f5d9a2d0969427de9d59a243c87a5d20699dcce3
3
+ metadata.gz: 198e3a74bbf86420f51a7d5b8f80e5e110cc93f2e144fadfe129f8f90155fe40
4
+ data.tar.gz: e54cfaf29ae9b910f5cc365a8095298e312ef93098c185dafa497b80a5f5859b
5
5
  SHA512:
6
- metadata.gz: e362717e5db0fad6a9494737e53c2444a84cb76fd274c70283a6650eef0891e9ced2af424b2ed9501eb749f21fcfb2ca3b4f8c7b336d1a248bb99f4a7e69131e
7
- data.tar.gz: 5d05d862a170f0efe0f6f6a9867846bb3b000136f638b1efe6ddee5e94310dc92495a40a7dff204b4098f92989175e1691ba10897be884ca960477d48dcbc6ca
6
+ metadata.gz: e5d6e2d8a21b12d6984bd4fc3d4cdc7858b46d1d21e09e238c832da830f8329af93f33ac6997590a342b52f68c7448aa68d68b8a323397116682e1373be28d38
7
+ data.tar.gz: d5a7c6df3f4a90f76e8554fffcb3cba45e274521f2efc96ecf4c882228b8398ed4ec0cb3322ccf1090b615b3ecae5b19b8f947e779fb3bf1a72c6e101d4184f1
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.5" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -39,10 +39,13 @@
39
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
40
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
41
41
  </a>
42
+ <a href="https://artifacthub.io/packages/search?repo=kreuzberg">
43
+ <img src="https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/kreuzberg" alt="Artifact Hub">
44
+ </a>
42
45
 
43
46
  <!-- Project Info -->
44
47
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
48
+ <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
46
49
  </a>
47
50
  <a href="https://docs.kreuzberg.dev">
48
51
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
@@ -419,7 +422,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
419
422
 
420
423
  ## License
421
424
 
422
- Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
425
+ MIT License - see LICENSE file for details.
423
426
 
424
427
  ## Support
425
428
 
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -326,9 +326,9 @@ dependencies = [
326
326
 
327
327
  [[package]]
328
328
  name = "axum"
329
- version = "0.8.8"
329
+ version = "0.8.9"
330
330
  source = "registry+https://github.com/rust-lang/crates.io-index"
331
- checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
331
+ checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90"
332
332
  dependencies = [
333
333
  "axum-core",
334
334
  "axum-macros",
@@ -380,9 +380,9 @@ dependencies = [
380
380
 
381
381
  [[package]]
382
382
  name = "axum-macros"
383
- version = "0.5.0"
383
+ version = "0.5.1"
384
384
  source = "registry+https://github.com/rust-lang/crates.io-index"
385
- checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c"
385
+ checksum = "7aa268c23bfbbd2c4363b9cd302a4f504fb2a9dfe7e3451d66f35dd392e20aca"
386
386
  dependencies = [
387
387
  "proc-macro2",
388
388
  "quote",
@@ -766,7 +766,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
766
766
  dependencies = [
767
767
  "cfg-if",
768
768
  "cpufeatures 0.3.0",
769
- "rand_core 0.10.0",
769
+ "rand_core 0.10.1",
770
770
  ]
771
771
 
772
772
  [[package]]
@@ -1858,7 +1858,7 @@ dependencies = [
1858
1858
  "js-sys",
1859
1859
  "libc",
1860
1860
  "r-efi 6.0.0",
1861
- "rand_core 0.10.0",
1861
+ "rand_core 0.10.1",
1862
1862
  "wasip2",
1863
1863
  "wasip3",
1864
1864
  "wasm-bindgen",
@@ -2098,7 +2098,7 @@ dependencies = [
2098
2098
  "indicatif 0.17.11",
2099
2099
  "libc",
2100
2100
  "log",
2101
- "rand 0.9.3",
2101
+ "rand 0.9.4",
2102
2102
  "serde",
2103
2103
  "serde_json",
2104
2104
  "thiserror 2.0.18",
@@ -2117,7 +2117,7 @@ dependencies = [
2117
2117
  "indicatif 0.18.4",
2118
2118
  "libc",
2119
2119
  "log",
2120
- "rand 0.9.3",
2120
+ "rand 0.9.4",
2121
2121
  "serde",
2122
2122
  "serde_json",
2123
2123
  "thiserror 2.0.18",
@@ -2529,7 +2529,7 @@ dependencies = [
2529
2529
  "itertools 0.14.0",
2530
2530
  "nalgebra",
2531
2531
  "num",
2532
- "rand 0.9.3",
2532
+ "rand 0.9.4",
2533
2533
  "rand_distr",
2534
2534
  ]
2535
2535
 
@@ -2931,7 +2931,7 @@ dependencies = [
2931
2931
 
2932
2932
  [[package]]
2933
2933
  name = "kreuzberg-rb"
2934
- version = "4.8.4"
2934
+ version = "4.8.5"
2935
2935
  dependencies = [
2936
2936
  "async-trait",
2937
2937
  "html-to-markdown-rs",
@@ -3138,9 +3138,9 @@ dependencies = [
3138
3138
 
3139
3139
  [[package]]
3140
3140
  name = "lru"
3141
- version = "0.16.3"
3141
+ version = "0.16.4"
3142
3142
  source = "registry+https://github.com/rust-lang/crates.io-index"
3143
- checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593"
3143
+ checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
3144
3144
  dependencies = [
3145
3145
  "hashbrown 0.16.1",
3146
3146
  ]
@@ -3707,7 +3707,7 @@ dependencies = [
3707
3707
  "futures-util",
3708
3708
  "opentelemetry",
3709
3709
  "percent-encoding",
3710
- "rand 0.9.3",
3710
+ "rand 0.9.4",
3711
3711
  "thiserror 2.0.18",
3712
3712
  "tokio",
3713
3713
  "tokio-stream",
@@ -4074,7 +4074,7 @@ dependencies = [
4074
4074
  "bytes",
4075
4075
  "getrandom 0.3.4",
4076
4076
  "lru-slab",
4077
- "rand 0.9.3",
4077
+ "rand 0.9.4",
4078
4078
  "ring",
4079
4079
  "rustc-hash",
4080
4080
  "rustls",
@@ -4140,9 +4140,9 @@ dependencies = [
4140
4140
 
4141
4141
  [[package]]
4142
4142
  name = "rand"
4143
- version = "0.9.3"
4143
+ version = "0.9.4"
4144
4144
  source = "registry+https://github.com/rust-lang/crates.io-index"
4145
- checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166"
4145
+ checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
4146
4146
  dependencies = [
4147
4147
  "rand_chacha",
4148
4148
  "rand_core 0.9.5",
@@ -4156,7 +4156,7 @@ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
4156
4156
  dependencies = [
4157
4157
  "chacha20",
4158
4158
  "getrandom 0.4.2",
4159
- "rand_core 0.10.0",
4159
+ "rand_core 0.10.1",
4160
4160
  ]
4161
4161
 
4162
4162
  [[package]]
@@ -4180,9 +4180,9 @@ dependencies = [
4180
4180
 
4181
4181
  [[package]]
4182
4182
  name = "rand_core"
4183
- version = "0.10.0"
4183
+ version = "0.10.1"
4184
4184
  source = "registry+https://github.com/rust-lang/crates.io-index"
4185
- checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
4185
+ checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
4186
4186
 
4187
4187
  [[package]]
4188
4188
  name = "rand_distr"
@@ -4191,7 +4191,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
4191
4191
  checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
4192
4192
  dependencies = [
4193
4193
  "num-traits",
4194
- "rand 0.9.3",
4194
+ "rand 0.9.4",
4195
4195
  ]
4196
4196
 
4197
4197
  [[package]]
@@ -4227,7 +4227,7 @@ dependencies = [
4227
4227
  "num-traits",
4228
4228
  "paste",
4229
4229
  "profiling",
4230
- "rand 0.9.3",
4230
+ "rand 0.9.4",
4231
4231
  "rand_chacha",
4232
4232
  "simd_helpers",
4233
4233
  "thiserror 2.0.18",
@@ -4258,9 +4258,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
4258
4258
 
4259
4259
  [[package]]
4260
4260
  name = "rayon"
4261
- version = "1.11.0"
4261
+ version = "1.12.0"
4262
4262
  source = "registry+https://github.com/rust-lang/crates.io-index"
4263
- checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
4263
+ checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
4264
4264
  dependencies = [
4265
4265
  "either",
4266
4266
  "rayon-core",
@@ -4625,9 +4625,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
4625
4625
 
4626
4626
  [[package]]
4627
4627
  name = "rustls-webpki"
4628
- version = "0.103.11"
4628
+ version = "0.103.12"
4629
4629
  source = "registry+https://github.com/rust-lang/crates.io-index"
4630
- checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
4630
+ checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06"
4631
4631
  dependencies = [
4632
4632
  "aws-lc-rs",
4633
4633
  "ring",
@@ -5378,7 +5378,7 @@ dependencies = [
5378
5378
  "monostate",
5379
5379
  "onig",
5380
5380
  "paste",
5381
- "rand 0.9.3",
5381
+ "rand 0.9.4",
5382
5382
  "rayon",
5383
5383
  "rayon-cond",
5384
5384
  "regex",
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.4"
3
+ version = "4.8.5"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs CHANGED
@@ -111,6 +111,7 @@ impl OcrBackend for RubyOcrBackend {
111
111
  uris: None,
112
112
  ocr_internal_document: None,
113
113
  structured_output: None,
114
+ llm_usage: None,
114
115
  })
115
116
  })
116
117
  }
data/ext/kreuzberg_rb/native/src/result.rs CHANGED
@@ -712,6 +712,45 @@ pub fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> R
712
712
  }
713
713
  set_hash_entry(ruby, &hash, "processing_warnings", warnings_array.into_value_with(ruby))?;
714
714
 
715
+ // Convert LLM usage
716
+ if let Some(usages) = result.llm_usage {
717
+ let usage_array = ruby.ary_new();
718
+ for usage in usages {
719
+ let usage_hash = ruby.hash_new();
720
+ usage_hash.aset("model", usage.model.as_str())?;
721
+ usage_hash.aset("source", usage.source.as_str())?;
722
+ if let Some(input_tokens) = usage.input_tokens {
723
+ usage_hash.aset("input_tokens", input_tokens as i64)?;
724
+ } else {
725
+ usage_hash.aset("input_tokens", ruby.qnil().as_value())?;
726
+ }
727
+ if let Some(output_tokens) = usage.output_tokens {
728
+ usage_hash.aset("output_tokens", output_tokens as i64)?;
729
+ } else {
730
+ usage_hash.aset("output_tokens", ruby.qnil().as_value())?;
731
+ }
732
+ if let Some(total_tokens) = usage.total_tokens {
733
+ usage_hash.aset("total_tokens", total_tokens as i64)?;
734
+ } else {
735
+ usage_hash.aset("total_tokens", ruby.qnil().as_value())?;
736
+ }
737
+ if let Some(cost) = usage.estimated_cost {
738
+ usage_hash.aset("estimated_cost", ruby.float_from_f64(cost).into_value_with(ruby))?;
739
+ } else {
740
+ usage_hash.aset("estimated_cost", ruby.qnil().as_value())?;
741
+ }
742
+ if let Some(reason) = usage.finish_reason {
743
+ usage_hash.aset("finish_reason", reason.as_str())?;
744
+ } else {
745
+ usage_hash.aset("finish_reason", ruby.qnil().as_value())?;
746
+ }
747
+ usage_array.push(usage_hash)?;
748
+ }
749
+ set_hash_entry(ruby, &hash, "llm_usage", usage_array.into_value_with(ruby))?;
750
+ } else {
751
+ set_hash_entry(ruby, &hash, "llm_usage", ruby.qnil().as_value())?;
752
+ }
753
+
715
754
  // Convert annotations
716
755
  if let Some(annotations) = result.annotations {
717
756
  let annotations_array = ruby.ary_new();
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.4'
4
+ VERSION = '4.8.5'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -18,7 +18,8 @@ module Kreuzberg
18
18
  type element_type = 'title' | 'narrative_text' | 'heading' | 'list_item' | 'table' | 'image' | 'page_break' | 'code_block' | 'block_quote' | 'footer' | 'header'
19
19
 
20
20
  # Bounding box coordinates for element positioning (T::Struct from types.rb)
21
- class BoundingBox attr_reader x0: Float
21
+ class BoundingBox
22
+ attr_reader x0: Float
22
23
  attr_reader y0: Float
23
24
  attr_reader x1: Float
24
25
  attr_reader y1: Float
@@ -28,7 +29,8 @@ module Kreuzberg
28
29
  end
29
30
 
30
31
  # Metadata for a semantic element (T::Struct from types.rb)
31
- class ElementMetadata attr_reader page_number: Integer?
32
+ class ElementMetadata
33
+ attr_reader page_number: Integer?
32
34
  attr_reader filename: String?
33
35
  attr_reader coordinates: BoundingBox?
34
36
  attr_reader element_index: Integer?
@@ -39,7 +41,8 @@ module Kreuzberg
39
41
  end
40
42
 
41
43
  # Semantic element extracted from document (T::Struct from types.rb)
42
- class Element attr_reader element_id: String
44
+ class Element
45
+ attr_reader element_id: String
43
46
  attr_reader element_type: String
44
47
  attr_reader text: String
45
48
  attr_reader metadata: ElementMetadata
@@ -49,7 +52,8 @@ module Kreuzberg
49
52
  end
50
53
 
51
54
  # Header/Heading metadata (T::Struct from types.rb)
52
- class HeaderMetadata attr_reader level: Integer
55
+ class HeaderMetadata
56
+ attr_reader level: Integer
53
57
  attr_reader text: String
54
58
  attr_reader id: String?
55
59
  attr_reader depth: Integer
@@ -60,7 +64,8 @@ module Kreuzberg
60
64
  end
61
65
 
62
66
  # Link metadata (T::Struct from types.rb)
63
- class LinkMetadata attr_reader href: String
67
+ class LinkMetadata
68
+ attr_reader href: String
64
69
  attr_reader text: String
65
70
  attr_reader title: String?
66
71
  attr_reader link_type: String
@@ -72,7 +77,8 @@ module Kreuzberg
72
77
  end
73
78
 
74
79
  # Image metadata (T::Struct from types.rb)
75
- class ImageMetadata attr_reader src: String
80
+ class ImageMetadata
81
+ attr_reader src: String
76
82
  attr_reader alt: String?
77
83
  attr_reader title: String?
78
84
  attr_reader dimensions: Array[Integer]?
@@ -84,7 +90,8 @@ module Kreuzberg
84
90
  end
85
91
 
86
92
  # Structured data metadata (T::Struct from types.rb)
87
- class StructuredData attr_reader data_type: String
93
+ class StructuredData
94
+ attr_reader data_type: String
88
95
  attr_reader raw_json: String
89
96
  attr_reader schema_type: String?
90
97
 
@@ -210,7 +217,8 @@ module Kreuzberg
210
217
  end
211
218
 
212
219
  # HTML metadata (T::Struct from types.rb)
213
- class HtmlMetadata attr_reader title: String?
220
+ class HtmlMetadata
221
+ attr_reader title: String?
214
222
  attr_reader description: String?
215
223
  attr_reader author: String?
216
224
  attr_reader copyright: String?
@@ -261,7 +269,8 @@ module Kreuzberg
261
269
  end
262
270
 
263
271
  # Extracted keyword with relevance metadata (T::Struct from types.rb)
264
- class ExtractedKeyword attr_reader text: String
272
+ class ExtractedKeyword
273
+ attr_reader text: String
265
274
  attr_reader score: Float
266
275
  attr_reader algorithm: String
267
276
  attr_reader positions: Array[Integer]?
@@ -271,15 +280,39 @@ module Kreuzberg
271
280
  end
272
281
 
273
282
  # Processing warning from a pipeline stage (T::Struct from types.rb)
274
- class ProcessingWarning attr_reader source: String
283
+ class ProcessingWarning
284
+ attr_reader source: String
275
285
  attr_reader message: String
276
286
 
277
287
  def initialize: (source: String, message: String) -> void
278
288
  def serialize: () -> Hash[Symbol, untyped]
279
289
  end
280
290
 
291
+ # LLM token usage from an LLM-assisted extraction step (T::Struct from types.rb)
292
+ class LlmUsage
293
+ attr_reader model: String
294
+ attr_reader source: String
295
+ attr_reader input_tokens: Integer?
296
+ attr_reader output_tokens: Integer?
297
+ attr_reader total_tokens: Integer?
298
+ attr_reader estimated_cost: Float?
299
+ attr_reader finish_reason: String?
300
+
301
+ def initialize: (
302
+ model: String,
303
+ source: String,
304
+ ?input_tokens: Integer?,
305
+ ?output_tokens: Integer?,
306
+ ?total_tokens: Integer?,
307
+ ?estimated_cost: Float?,
308
+ ?finish_reason: String?
309
+ ) -> void
310
+ def serialize: () -> Hash[Symbol, untyped]
311
+ end
312
+
281
313
  # Bounding box for document node positioning (T::Struct from types.rb)
282
- class DocumentBoundingBox attr_reader x0: Float
314
+ class DocumentBoundingBox
315
+ attr_reader x0: Float
283
316
  attr_reader y0: Float
284
317
  attr_reader x1: Float
285
318
  attr_reader y1: Float
@@ -289,7 +322,8 @@ module Kreuzberg
289
322
  end
290
323
 
291
324
  # Annotation for a document node (T::Struct from types.rb)
292
- class DocumentAnnotation attr_reader key: String
325
+ class DocumentAnnotation
326
+ attr_reader key: String
293
327
  attr_reader value: String
294
328
 
295
329
  def initialize: (key: String, value: String) -> void
@@ -297,7 +331,8 @@ module Kreuzberg
297
331
  end
298
332
 
299
333
  # Single node in the document structure tree (T::Struct from types.rb)
300
- class DocumentNode attr_reader id: String
334
+ class DocumentNode
335
+ attr_reader id: String
301
336
  attr_reader content: String
302
337
  attr_reader parent: Integer?
303
338
  attr_reader children: Array[Integer]
@@ -322,7 +357,8 @@ module Kreuzberg
322
357
  end
323
358
 
324
359
  # Structured document representation (T::Struct from types.rb)
325
- class DocumentStructure attr_reader nodes: Array[DocumentNode]
360
+ class DocumentStructure
361
+ attr_reader nodes: Array[DocumentNode]
326
362
 
327
363
  def initialize: (nodes: Array[DocumentNode]) -> void
328
364
  def serialize: () -> Hash[Symbol, untyped]
@@ -927,7 +963,10 @@ module Kreuzberg
927
963
  extracted_keywords: Array[extracted_keyword_hash]?,
928
964
  quality_score: Float?,
929
965
  processing_warnings: Array[processing_warning_hash]?,
930
- annotations: Array[pdf_annotation_hash]?
966
+ annotations: Array[pdf_annotation_hash]?,
967
+ uris: Array[uri_hash]?,
968
+ children: Array[archive_entry_hash]?,
969
+ llm_usage: Array[llm_usage_hash]?
931
970
  }
932
971
 
933
972
  type extracted_keyword_hash = {
@@ -942,6 +981,29 @@ module Kreuzberg
942
981
  message: String
943
982
  }
944
983
 
984
+ type llm_usage_hash = {
985
+ model: String,
986
+ source: String,
987
+ input_tokens: Integer?,
988
+ output_tokens: Integer?,
989
+ total_tokens: Integer?,
990
+ estimated_cost: Float?,
991
+ finish_reason: String?
992
+ }
993
+
994
+ type uri_hash = {
995
+ url: String,
996
+ label: String?,
997
+ page: Integer?,
998
+ kind: String
999
+ }
1000
+
1001
+ type archive_entry_hash = {
1002
+ path: String,
1003
+ mime_type: String,
1004
+ result: extraction_result_hash?
1005
+ }
1006
+
945
1007
  type page_content_hash = {
946
1008
  page_number: Integer,
947
1009
  content: String,
@@ -1483,6 +1545,9 @@ module Kreuzberg
1483
1545
  attr_reader quality_score: Float?
1484
1546
  attr_reader processing_warnings: Array[ProcessingWarning]?
1485
1547
  attr_reader annotations: Array[PdfAnnotation]?
1548
+ attr_reader uris: Array[uri_hash]?
1549
+ attr_reader children: Array[archive_entry_hash]?
1550
+ attr_reader llm_usage: Array[LlmUsage]?
1486
1551
 
1487
1552
  # PDF annotation extracted from a document page (Struct from result.rb)
1488
1553
  class PdfAnnotation
@@ -1521,6 +1586,11 @@ module Kreuzberg
1521
1586
  def parse_document_structure: (Hash[String, untyped]? document_data) -> DocumentStructure?
1522
1587
  def parse_extracted_keywords: (Array[extracted_keyword_hash]? keywords_data) -> Array[ExtractedKeyword]?
1523
1588
  def parse_processing_warnings: (Array[processing_warning_hash]? warnings_data) -> Array[ProcessingWarning]
1589
+ def parse_uris: (Array[uri_hash]? uris_data) -> Array[uri_hash]?
1590
+ def build_uri: (Hash[String, untyped] u_hash) -> uri_hash
1591
+ def parse_children: (Array[untyped]? children_data) -> Array[archive_entry_hash]?
1592
+ def build_archive_entry: (Hash[String, untyped] c_hash) -> archive_entry_hash
1593
+ def parse_llm_usage: (Array[llm_usage_hash]? usage_data) -> Array[LlmUsage]?
1524
1594
  def get_value: (Hash[String | Symbol, untyped] hash, String key, ?untyped default) -> untyped
1525
1595
  def serialize_tables: () -> Array[table_hash]
1526
1596
  def serialize_chunks: () -> Array[chunk_hash]?
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.8.4"
5
+ version = "4.8.5"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.8.4", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.4" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.8.5", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.5" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -47,7 +47,7 @@ ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features
47
47
  parking_lot = "0.12.5"
48
48
  pdf_oxide = { version = "0.3.30", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
- rayon = "1.11.0"
50
+ rayon = "1.12.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
52
52
  serde = { version = "1.0.228", features = ["derive"] }
53
53
  serde_json = { version = "1.0.149" }
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
57
57
  tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
58
  toml = "1.1.2"
59
59
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.4.2", features = ["serde"], default-features = false }
60
+ tree-sitter-language-pack = { version = "1.6.0", features = ["serde"], default-features = false }
61
61
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
62
  wasm-bindgen-futures = "0.4"
63
63
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.8.4"
3
+ version = "4.8.5"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -319,7 +319,7 @@ pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfi
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
321
321
  rake = { version = "0.3.6", optional = true }
322
- rayon = "1.11.0"
322
+ rayon = "1.12.0"
323
323
  regex = "1.12.3"
324
324
  rmcp = { version = "1.4.0", features = [
325
325
  "server",
@@ -392,7 +392,7 @@ optional = true
392
392
  # Override getrandom to enable js feature for WASM targets
393
393
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
394
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.4.2", features = ["wasm", "serde"], default-features = false, optional = true }
395
+ tree-sitter-language-pack = { version = "1.6.0", features = ["serde"], default-features = false, optional = true }
396
396
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
397
 
398
398
  [build-dependencies]
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.8.4 Release**
21
+ > **🚀 Version 4.8.5 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/api/handlers.rs CHANGED
@@ -680,7 +680,7 @@ pub async fn extract_structured_handler(
680
680
  };
681
681
 
682
682
  // Run structured extraction on the extracted content
683
- let structured_output = crate::llm::structured::extract_structured(&result.content, &structured_config)
683
+ let (structured_output, _usage) = crate::llm::structured::extract_structured(&result.content, &structured_config)
684
684
  .await
685
685
  .map_err(ApiError::internal)?;
686
686