kreuzberg 4.7.4 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/ext/kreuzberg_rb/native/Cargo.lock +81 -12
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +46 -0
  6. data/ext/kreuzberg_rb/native/src/embedding.rs +87 -0
  7. data/ext/kreuzberg_rb/native/src/error_handling.rs +7 -0
  8. data/ext/kreuzberg_rb/native/src/lib.rs +5 -0
  9. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
  10. data/lib/kreuzberg/config.rb +43 -5
  11. data/lib/kreuzberg/errors.rb +4 -0
  12. data/lib/kreuzberg/extraction_api.rb +35 -0
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +7 -0
  15. data/sig/kreuzberg.rbs +20 -0
  16. data/vendor/Cargo.toml +5 -3
  17. data/vendor/kreuzberg/Cargo.toml +9 -1
  18. data/vendor/kreuzberg/README.md +3 -3
  19. data/vendor/kreuzberg/src/api/error.rs +1 -0
  20. data/vendor/kreuzberg/src/api/handlers.rs +231 -39
  21. data/vendor/kreuzberg/src/api/mod.rs +2 -1
  22. data/vendor/kreuzberg/src/api/openapi.rs +2 -0
  23. data/vendor/kreuzberg/src/api/router.rs +3 -1
  24. data/vendor/kreuzberg/src/api/types.rs +13 -0
  25. data/vendor/kreuzberg/src/chunking/processor.rs +12 -189
  26. data/vendor/kreuzberg/src/core/config/content_filter.rs +73 -0
  27. data/vendor/kreuzberg/src/core/config/extraction/core.rs +48 -0
  28. data/vendor/kreuzberg/src/core/config/extraction/env.rs +149 -37
  29. data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
  30. data/vendor/kreuzberg/src/core/config/llm.rs +108 -0
  31. data/vendor/kreuzberg/src/core/config/mod.rs +4 -0
  32. data/vendor/kreuzberg/src/core/config/ocr.rs +32 -0
  33. data/vendor/kreuzberg/src/core/config/processing.rs +32 -0
  34. data/vendor/kreuzberg/src/core/config_validation/mod.rs +2 -2
  35. data/vendor/kreuzberg/src/core/config_validation/sections.rs +131 -1
  36. data/vendor/kreuzberg/src/core/extractor/helpers.rs +1 -20
  37. data/vendor/kreuzberg/src/core/pipeline/format.rs +0 -6
  38. data/vendor/kreuzberg/src/core/pipeline/mod.rs +24 -0
  39. data/vendor/kreuzberg/src/core/pipeline/tests.rs +6 -10
  40. data/vendor/kreuzberg/src/embeddings/mod.rs +246 -72
  41. data/vendor/kreuzberg/src/error.rs +8 -0
  42. data/vendor/kreuzberg/src/extraction/derive.rs +1 -8
  43. data/vendor/kreuzberg/src/extraction/doc/mod.rs +23 -0
  44. data/vendor/kreuzberg/src/extraction/hwp/model.rs +20 -11
  45. data/vendor/kreuzberg/src/extraction/hwp/parser.rs +39 -4
  46. data/vendor/kreuzberg/src/extraction/image_ocr.rs +2 -21
  47. data/vendor/kreuzberg/src/extraction/mod.rs +1 -1
  48. data/vendor/kreuzberg/src/extraction/ppt/mod.rs +17 -3
  49. data/vendor/kreuzberg/src/extraction/pptx/container.rs +14 -1
  50. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +56 -1
  51. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +3 -44
  52. data/vendor/kreuzberg/src/extraction/transform/mod.rs +7 -136
  53. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -41
  54. data/vendor/kreuzberg/src/extractors/docx.rs +12 -0
  55. data/vendor/kreuzberg/src/extractors/epub/mod.rs +6 -1
  56. data/vendor/kreuzberg/src/extractors/html.rs +45 -3
  57. data/vendor/kreuzberg/src/extractors/odt.rs +12 -1
  58. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +9 -0
  59. data/vendor/kreuzberg/src/extractors/ppt.rs +9 -5
  60. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +15 -2
  61. data/vendor/kreuzberg/src/keywords/processor.rs +13 -160
  62. data/vendor/kreuzberg/src/language_detection/processor.rs +5 -106
  63. data/vendor/kreuzberg/src/lib.rs +13 -4
  64. data/vendor/kreuzberg/src/llm/client.rs +39 -0
  65. data/vendor/kreuzberg/src/llm/mod.rs +15 -0
  66. data/vendor/kreuzberg/src/llm/prompts.rs +52 -0
  67. data/vendor/kreuzberg/src/llm/structured.rs +190 -0
  68. data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +118 -0
  69. data/vendor/kreuzberg/src/llm/vlm_ocr.rs +180 -0
  70. data/vendor/kreuzberg/src/mcp/errors.rs +18 -0
  71. data/vendor/kreuzberg/src/mcp/format.rs +4 -82
  72. data/vendor/kreuzberg/src/mcp/mod.rs +2 -1
  73. data/vendor/kreuzberg/src/mcp/params.rs +38 -0
  74. data/vendor/kreuzberg/src/mcp/server.rs +122 -53
  75. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -35
  76. data/vendor/kreuzberg/src/paddle_ocr/backend.rs +1 -16
  77. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +36 -12
  78. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +1 -16
  79. data/vendor/kreuzberg/src/plugins/mod.rs +3 -52
  80. data/vendor/kreuzberg/src/plugins/ocr.rs +4 -69
  81. data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -167
  82. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +13 -44
  83. data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -230
  84. data/vendor/kreuzberg/src/rendering/markdown.rs +1 -1
  85. data/vendor/kreuzberg/src/text/quality_processor.rs +8 -109
  86. data/vendor/kreuzberg/src/types/extraction.rs +9 -0
  87. data/vendor/kreuzberg/tests/llm_integration.rs +295 -0
  88. data/vendor/kreuzberg-ffi/Cargo.toml +4 -3
  89. data/vendor/kreuzberg-ffi/README.md +2 -2
  90. data/vendor/kreuzberg-ffi/kreuzberg.h +71 -7
  91. data/vendor/kreuzberg-ffi/src/config_builder.rs +90 -2
  92. data/vendor/kreuzberg-ffi/src/embedding.rs +94 -0
  93. data/vendor/kreuzberg-ffi/src/error.rs +46 -16
  94. data/vendor/kreuzberg-ffi/src/helpers.rs +28 -75
  95. data/vendor/kreuzberg-ffi/src/lib.rs +9 -3
  96. data/vendor/kreuzberg-ffi/src/memory.rs +4 -0
  97. data/vendor/kreuzberg-ffi/src/panic_shield.rs +2 -0
  98. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -19
  99. data/vendor/kreuzberg-ffi/src/result.rs +1 -15
  100. data/vendor/kreuzberg-ffi/src/result_view.rs +2 -34
  101. data/vendor/kreuzberg-ffi/src/string_intern.rs +9 -0
  102. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  103. data/vendor/kreuzberg-ffi/tests/c/test_config_builder.c +5 -0
  104. data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
  105. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  106. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  107. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  108. metadata +14 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3fffd5a1e2d066b0997155be101dade834002fd2805fd556b2e5b6b8a8d29be9
4
- data.tar.gz: 0f1ce8406a8880327191fa4e49bdadd71f938ec47ecb3b7e38f4121008d16600
3
+ metadata.gz: c768f2adf392c1598da39e79b20dcc7f0c55774f2d3063c74c6ec72888742bac
4
+ data.tar.gz: 8a350998762668be79e7f4a3843812782f4b0812fdb41327d8d38f1f57c39073
5
5
  SHA512:
6
- metadata.gz: f878aeea0ccb330d30f863707fc31ba0dce4a8b2eb5f6fca705e5d21337a68ea209328a8dc84ae565c20a996d3ae6f5e53149d570a5439f6746f27df3d1c5671
7
- data.tar.gz: 1ca62afb51ffb9d85f3a820c7d44473d393bfdefab21f78723c52d3304082382e30d646c2751203d2f0a0d2ab346ec664856662ba7c212c565b35777fa8f167f
6
+ metadata.gz: 18b2dc0c39199bead5f1f310a38b5f288cba0d8ebb62d31505f13933a0af3bf9179f8377e7a8bd605c39d46e900869a4492f1bd829f853d6f1765263f4affd4d
7
+ data.tar.gz: 769b9c36d5f6722f97f3c3aebb97f1045382a6dbca96405e298c44bc268795d97781edb89d140c1bc2f0e1fb33c5efecb79c2e500469dd7a25fe2105c49081ba
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.4" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -42,7 +42,7 @@
42
42
 
43
43
  <!-- Project Info -->
44
44
  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
45
- <img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
45
+ <img src="https://img.shields.io/badge/License-Elastic--2.0-blue.svg" alt="License">
46
46
  </a>
47
47
  <a href="https://docs.kreuzberg.dev">
48
48
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
@@ -419,7 +419,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
419
419
 
420
420
  ## License
421
421
 
422
- MIT License - see LICENSE file for details.
422
+ Elastic License 2.0 (ELv2) - see [LICENSE](../../LICENSE) for details.
423
423
 
424
424
  ## Support
425
425
 
@@ -1565,9 +1565,9 @@ dependencies = [
1565
1565
 
1566
1566
  [[package]]
1567
1567
  name = "fastrand"
1568
- version = "2.4.0"
1568
+ version = "2.4.1"
1569
1569
  source = "registry+https://github.com/rust-lang/crates.io-index"
1570
- checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
1570
+ checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
1571
1571
 
1572
1572
  [[package]]
1573
1573
  name = "fax"
@@ -2792,7 +2792,7 @@ dependencies = [
2792
2792
 
2793
2793
  [[package]]
2794
2794
  name = "kreuzberg"
2795
- version = "4.7.3"
2795
+ version = "4.7.4"
2796
2796
  dependencies = [
2797
2797
  "ahash",
2798
2798
  "async-trait",
@@ -2829,12 +2829,14 @@ dependencies = [
2829
2829
  "kreuzberg-pdfium-render",
2830
2830
  "kreuzberg-tesseract",
2831
2831
  "libc",
2832
+ "liter-llm",
2832
2833
  "log",
2833
2834
  "lopdf",
2834
2835
  "mail-parser",
2835
2836
  "memchr",
2836
2837
  "memmap2",
2837
2838
  "mime_guess",
2839
+ "minijinja",
2838
2840
  "ndarray",
2839
2841
  "num_cpus",
2840
2842
  "once_cell",
@@ -2880,12 +2882,12 @@ dependencies = [
2880
2882
  "utoipa",
2881
2883
  "whatlang",
2882
2884
  "yake-rust",
2883
- "zip 8.5.0",
2885
+ "zip 8.5.1",
2884
2886
  ]
2885
2887
 
2886
2888
  [[package]]
2887
2889
  name = "kreuzberg-ffi"
2888
- version = "4.7.3"
2890
+ version = "4.7.4"
2889
2891
  dependencies = [
2890
2892
  "ahash",
2891
2893
  "async-trait",
@@ -2901,7 +2903,7 @@ dependencies = [
2901
2903
 
2902
2904
  [[package]]
2903
2905
  name = "kreuzberg-paddle-ocr"
2904
- version = "4.7.3"
2906
+ version = "4.7.4"
2905
2907
  dependencies = [
2906
2908
  "geo-clipper",
2907
2909
  "geo-types",
@@ -2915,7 +2917,7 @@ dependencies = [
2915
2917
 
2916
2918
  [[package]]
2917
2919
  name = "kreuzberg-pdfium-render"
2918
- version = "4.7.3"
2920
+ version = "4.7.4"
2919
2921
  dependencies = [
2920
2922
  "bitflags",
2921
2923
  "bytemuck",
@@ -2938,7 +2940,7 @@ dependencies = [
2938
2940
 
2939
2941
  [[package]]
2940
2942
  name = "kreuzberg-rb"
2941
- version = "4.7.3"
2943
+ version = "4.7.4"
2942
2944
  dependencies = [
2943
2945
  "async-trait",
2944
2946
  "html-to-markdown-rs",
@@ -2955,13 +2957,13 @@ dependencies = [
2955
2957
 
2956
2958
  [[package]]
2957
2959
  name = "kreuzberg-tesseract"
2958
- version = "4.7.3"
2960
+ version = "4.7.4"
2959
2961
  dependencies = [
2960
2962
  "cc",
2961
2963
  "cmake",
2962
2964
  "reqwest",
2963
2965
  "thiserror 2.0.18",
2964
- "zip 8.5.0",
2966
+ "zip 8.5.1",
2965
2967
  ]
2966
2968
 
2967
2969
  [[package]]
@@ -3060,6 +3062,27 @@ version = "0.8.2"
3060
3062
  source = "registry+https://github.com/rust-lang/crates.io-index"
3061
3063
  checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
3062
3064
 
3065
+ [[package]]
3066
+ name = "liter-llm"
3067
+ version = "1.2.0"
3068
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3069
+ checksum = "0405bbc5926db49a5f73a4f503d9cac19413416c95e2fd736b1bfa8ce9491660"
3070
+ dependencies = [
3071
+ "base64 0.22.1",
3072
+ "bytes",
3073
+ "futures-core",
3074
+ "memchr",
3075
+ "pin-project-lite",
3076
+ "reqwest",
3077
+ "secrecy",
3078
+ "serde",
3079
+ "serde_json",
3080
+ "thiserror 2.0.18",
3081
+ "tokio",
3082
+ "toml 1.1.2+spec-1.1.0",
3083
+ "tracing",
3084
+ ]
3085
+
3063
3086
  [[package]]
3064
3087
  name = "litrs"
3065
3088
  version = "1.0.0"
@@ -3269,6 +3292,12 @@ dependencies = [
3269
3292
  "libc",
3270
3293
  ]
3271
3294
 
3295
+ [[package]]
3296
+ name = "memo-map"
3297
+ version = "0.3.3"
3298
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3299
+ checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
3300
+
3272
3301
  [[package]]
3273
3302
  name = "mime"
3274
3303
  version = "0.3.17"
@@ -3285,6 +3314,16 @@ dependencies = [
3285
3314
  "unicase",
3286
3315
  ]
3287
3316
 
3317
+ [[package]]
3318
+ name = "minijinja"
3319
+ version = "2.19.0"
3320
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3321
+ checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d"
3322
+ dependencies = [
3323
+ "memo-map",
3324
+ "serde",
3325
+ ]
3326
+
3288
3327
  [[package]]
3289
3328
  name = "minimal-lexical"
3290
3329
  version = "0.2.1"
@@ -4391,6 +4430,7 @@ dependencies = [
4391
4430
  "hyper-util",
4392
4431
  "js-sys",
4393
4432
  "log",
4433
+ "mime_guess",
4394
4434
  "native-tls",
4395
4435
  "percent-encoding",
4396
4436
  "pin-project-lite",
@@ -4398,16 +4438,21 @@ dependencies = [
4398
4438
  "rustls",
4399
4439
  "rustls-pki-types",
4400
4440
  "rustls-platform-verifier",
4441
+ "serde",
4442
+ "serde_json",
4443
+ "serde_urlencoded",
4401
4444
  "sync_wrapper",
4402
4445
  "tokio",
4403
4446
  "tokio-native-tls",
4404
4447
  "tokio-rustls",
4448
+ "tokio-util",
4405
4449
  "tower",
4406
4450
  "tower-http",
4407
4451
  "tower-service",
4408
4452
  "url",
4409
4453
  "wasm-bindgen",
4410
4454
  "wasm-bindgen-futures",
4455
+ "wasm-streams",
4411
4456
  "web-sys",
4412
4457
  ]
4413
4458
 
@@ -4676,6 +4721,16 @@ version = "1.2.0"
4676
4721
  source = "registry+https://github.com/rust-lang/crates.io-index"
4677
4722
  checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
4678
4723
 
4724
+ [[package]]
4725
+ name = "secrecy"
4726
+ version = "0.10.3"
4727
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4728
+ checksum = "e891af845473308773346dc847b2c23ee78fe442e0472ac50e22a18a93d3ae5a"
4729
+ dependencies = [
4730
+ "serde",
4731
+ "zeroize",
4732
+ ]
4733
+
4679
4734
  [[package]]
4680
4735
  name = "security-framework"
4681
4736
  version = "3.7.0"
@@ -5382,6 +5437,7 @@ dependencies = [
5382
5437
  "bytes",
5383
5438
  "libc",
5384
5439
  "mio",
5440
+ "parking_lot",
5385
5441
  "pin-project-lite",
5386
5442
  "signal-hook-registry",
5387
5443
  "socket2",
@@ -6098,6 +6154,19 @@ dependencies = [
6098
6154
  "wasmparser",
6099
6155
  ]
6100
6156
 
6157
+ [[package]]
6158
+ name = "wasm-streams"
6159
+ version = "0.5.0"
6160
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6161
+ checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb"
6162
+ dependencies = [
6163
+ "futures-util",
6164
+ "js-sys",
6165
+ "wasm-bindgen",
6166
+ "wasm-bindgen-futures",
6167
+ "web-sys",
6168
+ ]
6169
+
6101
6170
  [[package]]
6102
6171
  name = "wasmparser"
6103
6172
  version = "0.244.0"
@@ -6788,9 +6857,9 @@ dependencies = [
6788
6857
 
6789
6858
  [[package]]
6790
6859
  name = "zip"
6791
- version = "8.5.0"
6860
+ version = "8.5.1"
6792
6861
  source = "registry+https://github.com/rust-lang/crates.io-index"
6793
- checksum = "2726508a48f38dceb22b35ecbbd2430efe34ff05c62bd3285f965d7911b33464"
6862
+ checksum = "dcab981e19633ebcf0b001ddd37dd802996098bc1864f90b7c5d970ce76c1d59"
6794
6863
  dependencies = [
6795
6864
  "crc32fast",
6796
6865
  "flate2",
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.7.4"
3
+ version = "4.8.0"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
7
- license = "MIT"
7
+ license = "Elastic-2.0"
8
8
  repository = "https://github.com/kreuzberg-dev/kreuzberg"
9
9
  homepage = "https://kreuzberg.dev"
10
10
  documentation = "https://docs.rs/kreuzberg"
@@ -22,6 +22,7 @@ use kreuzberg::{
22
22
  ImageExtractionConfig, LanguageDetectionConfig, LayoutDetectionConfig, OcrConfig, OutputFormat, PdfConfig,
23
23
  PostProcessorConfig, TokenReductionConfig,
24
24
  };
25
+ use kreuzberg::core::config::ContentFilterConfig;
25
26
  use magnus::value::ReprValue;
26
27
  use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value};
27
28
  use std::fs;
@@ -50,6 +51,8 @@ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
50
51
  auto_rotate: false,
51
52
  pipeline: None,
52
53
  quality_thresholds: None,
54
+ vlm_config: None,
55
+ vlm_prompt: None,
53
56
  };
54
57
 
55
58
  if let Some(val) = get_kw(ruby, hash, "tesseract_config")
@@ -860,6 +863,42 @@ pub fn parse_email_config(ruby: &Ruby, hash: RHash) -> Result<EmailConfig, Error
860
863
  Ok(config)
861
864
  }
862
865
 
866
+ /// Parse ContentFilterConfig from Ruby Hash
867
+ pub fn parse_content_filter_config(ruby: &Ruby, hash: RHash) -> Result<ContentFilterConfig, Error> {
868
+ let include_headers = if let Some(val) = get_kw(ruby, hash, "include_headers") {
869
+ bool::try_convert(val)?
870
+ } else {
871
+ false
872
+ };
873
+
874
+ let include_footers = if let Some(val) = get_kw(ruby, hash, "include_footers") {
875
+ bool::try_convert(val)?
876
+ } else {
877
+ false
878
+ };
879
+
880
+ let strip_repeating_text = if let Some(val) = get_kw(ruby, hash, "strip_repeating_text") {
881
+ bool::try_convert(val)?
882
+ } else {
883
+ true
884
+ };
885
+
886
+ let include_watermarks = if let Some(val) = get_kw(ruby, hash, "include_watermarks") {
887
+ bool::try_convert(val)?
888
+ } else {
889
+ false
890
+ };
891
+
892
+ let config = ContentFilterConfig {
893
+ include_headers,
894
+ include_footers,
895
+ strip_repeating_text,
896
+ include_watermarks,
897
+ };
898
+
899
+ Ok(config)
900
+ }
901
+
863
902
  /// Parse ExtractionConfig from Ruby Hash
864
903
  pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
865
904
  let mut config = ExtractionConfig::default();
@@ -996,6 +1035,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
996
1035
  config.email = Some(parse_email_config(ruby, email_hash)?);
997
1036
  }
998
1037
 
1038
+ if let Some(val) = get_kw(ruby, hash, "content_filter")
1039
+ && val.equal(ruby.qnil()).ok() != Some(true)
1040
+ {
1041
+ let content_filter_hash = RHash::try_convert(val)?;
1042
+ config.content_filter = Some(parse_content_filter_config(ruby, content_filter_hash)?);
1043
+ }
1044
+
999
1045
  if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
1000
1046
  let value = usize::try_convert(val)?;
1001
1047
  config.max_concurrent_extractions = Some(value);
@@ -0,0 +1,87 @@
1
+ //! Standalone embedding functions for Ruby.
2
+ //!
3
+ //! Exposes `embed_sync` and `embed` module functions that generate vector embeddings
4
+ //! from a list of text strings using the configured ONNX model.
5
+
6
+ use crate::error_handling::{kreuzberg_error, runtime_error};
7
+ use crate::helpers::ruby_value_to_json;
8
+ use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value, scan_args::get_kwargs, scan_args::scan_args};
9
+ use magnus::value::ReprValue;
10
+
11
+ /// Parse an optional Ruby value (Hash or nil) into a `kreuzberg::EmbeddingConfig`.
12
+ fn parse_embedding_config(ruby: &Ruby, config_val: Option<Value>) -> Result<kreuzberg::EmbeddingConfig, Error> {
13
+ match config_val {
14
+ None => Ok(Default::default()),
15
+ Some(val) => {
16
+ if val.equal(ruby.qnil())? {
17
+ return Ok(Default::default());
18
+ }
19
+ let json = ruby_value_to_json(val)?;
20
+ serde_json::from_value(json)
21
+ .map_err(|e| runtime_error(format!("Invalid embedding config: {}", e)))
22
+ }
23
+ }
24
+ }
25
+
26
+ /// Convert `Vec<Vec<f32>>` to a Ruby Array of Arrays of Floats.
27
+ fn embeddings_to_ruby(ruby: &Ruby, embeddings: Vec<Vec<f32>>) -> Result<RArray, Error> {
28
+ let outer = ruby.ary_new_capa(embeddings.len());
29
+ for inner_vec in embeddings {
30
+ let inner = ruby.ary_new_capa(inner_vec.len());
31
+ for v in inner_vec {
32
+ inner.push(v as f64)?;
33
+ }
34
+ outer.push(inner)?;
35
+ }
36
+ Ok(outer)
37
+ }
38
+
39
+ /// Parse keyword args common to `embed_sync` and `embed`.
40
+ /// Returns `(texts, config)`.
41
+ fn parse_embed_args(
42
+ ruby: &Ruby,
43
+ args: &[Value],
44
+ ) -> Result<(Vec<String>, kreuzberg::EmbeddingConfig), Error> {
45
+ let parsed = scan_args::<(), (), (), (), RHash, ()>(args)?;
46
+ let kw = parsed.keywords;
47
+
48
+ let kw_args = get_kwargs::<_, (Value,), (Option<Value>,), ()>(kw, &["texts"], &["config"])?;
49
+ let (texts_val,) = kw_args.required;
50
+ let (config_opt,) = kw_args.optional;
51
+
52
+ let texts_arr = RArray::try_convert(texts_val)
53
+ .map_err(|_| runtime_error("texts must be an Array".to_string()))?;
54
+ let texts: Vec<String> = texts_arr
55
+ .into_iter()
56
+ .enumerate()
57
+ .map(|(i, v)| {
58
+ String::try_convert(v)
59
+ .map_err(|_| runtime_error(format!("texts[{}] must be a String", i)))
60
+ })
61
+ .collect::<Result<_, _>>()?;
62
+
63
+ let config = parse_embedding_config(ruby, config_opt)?;
64
+ Ok((texts, config))
65
+ }
66
+
67
+ /// Generate embeddings synchronously.
68
+ ///
69
+ /// Keyword args: `texts:` (Array of String), `config:` (Hash, optional)
70
+ /// Returns: Array of Arrays of Float (one per input text).
71
+ pub fn embed_sync(args: &[Value]) -> Result<RArray, Error> {
72
+ let ruby = Ruby::get().expect("Ruby not initialized");
73
+ let (texts, config) = parse_embed_args(&ruby, args)?;
74
+ let embeddings = kreuzberg::embed_texts(&texts, &config).map_err(kreuzberg_error)?;
75
+ embeddings_to_ruby(&ruby, embeddings)
76
+ }
77
+
78
+ /// Generate embeddings (delegates to `embed_sync`).
79
+ ///
80
+ /// Ruby's GVL prevents true async execution, so this simply delegates to
81
+ /// the synchronous implementation to avoid creating a throwaway Tokio runtime.
82
+ ///
83
+ /// Keyword args: `texts:` (Array of String), `config:` (Hash, optional)
84
+ /// Returns: Array of Arrays of Float (one per input text).
85
+ pub fn embed(args: &[Value]) -> Result<RArray, Error> {
86
+ embed_sync(args)
87
+ }
@@ -101,6 +101,13 @@ pub fn kreuzberg_error(err: KreuzbergError) -> Error {
101
101
  )
102
102
  }
103
103
  }
104
+ KreuzbergError::Embedding { message, .. } => {
105
+ if let Some(class) = fetch_error_class("EmbeddingError") {
106
+ Error::new(class, message)
107
+ } else {
108
+ Error::new(ruby.exception_runtime_error(), format!("EmbeddingError: {}", message))
109
+ }
110
+ }
104
111
  other => Error::new(ruby.exception_runtime_error(), other.to_string()),
105
112
  }
106
113
  }
@@ -6,6 +6,7 @@
6
6
  //! Provides extraction, OCR, chunking, and language detection for 30+ file formats.
7
7
 
8
8
  // Module declarations
9
+ mod embedding;
9
10
  mod error_handling;
10
11
  mod gc_guarded_value;
11
12
  mod helpers;
@@ -457,6 +458,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
457
458
  module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
458
459
  module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
459
460
 
461
+ // Embedding functions
462
+ module.define_module_function("embed_sync", function!(embedding::embed_sync, -1))?;
463
+ module.define_module_function("embed", function!(embedding::embed, -1))?;
464
+
460
465
  // PDF page iterator
461
466
  module.define_module_function("native_render_pdf_pages_iter", function!(render_pdf_pages_iter, 2))?;
462
467
  module.define_module_function("native_render_pdf_page", function!(native_render_pdf_page, 3))?;
@@ -110,6 +110,7 @@ impl OcrBackend for RubyOcrBackend {
110
110
  formatted_content: None,
111
111
  uris: None,
112
112
  ocr_internal_document: None,
113
+ structured_output: None,
113
114
  })
114
115
  })
115
116
  }
@@ -856,6 +856,40 @@ module Kreuzberg
856
856
  end
857
857
  end
858
858
 
859
+ # Content filter configuration for controlling extraction of headers, footers,
860
+ # watermarks, and repeating text across document formats.
861
+ #
862
+ # @example Include headers and footers
863
+ # filter = ContentFilter.new(include_headers: true, include_footers: true)
864
+ #
865
+ # @example Disable repeating text removal
866
+ # filter = ContentFilter.new(strip_repeating_text: false)
867
+ #
868
+ class ContentFilter
869
+ attr_reader :include_headers, :include_footers, :strip_repeating_text, :include_watermarks
870
+
871
+ def initialize(
872
+ include_headers: false,
873
+ include_footers: false,
874
+ strip_repeating_text: true,
875
+ include_watermarks: false
876
+ )
877
+ @include_headers = include_headers ? true : false
878
+ @include_footers = include_footers ? true : false
879
+ @strip_repeating_text = strip_repeating_text ? true : false
880
+ @include_watermarks = include_watermarks ? true : false
881
+ end
882
+
883
+ def to_h
884
+ {
885
+ include_headers: @include_headers,
886
+ include_footers: @include_footers,
887
+ strip_repeating_text: @strip_repeating_text,
888
+ include_watermarks: @include_watermarks
889
+ }
890
+ end
891
+ end
892
+
859
893
  # Layout detection configuration
860
894
  #
861
895
  # @example Basic usage
@@ -951,7 +985,7 @@ module Kreuzberg
951
985
  :max_concurrent_extractions, :output_format, :result_format,
952
986
  :security_limits, :layout, :concurrency,
953
987
  :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs,
954
- :max_archive_depth, :acceleration, :email
988
+ :max_archive_depth, :acceleration, :email, :content_filter
955
989
 
956
990
  # Alias for backward compatibility - image_extraction is the canonical name
957
991
  alias image_extraction images
@@ -977,7 +1011,7 @@ module Kreuzberg
977
1011
  postprocessor token_reduction keywords html_options pages
978
1012
  max_concurrent_extractions output_format result_format
979
1013
  security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
980
- max_archive_depth acceleration email
1014
+ max_archive_depth acceleration email content_filter
981
1015
  ].freeze
982
1016
 
983
1017
  # Aliases for backward compatibility
@@ -1062,7 +1096,8 @@ module Kreuzberg
1062
1096
  extraction_timeout_secs: nil,
1063
1097
  max_archive_depth: 3,
1064
1098
  acceleration: nil,
1065
- email: nil)
1099
+ email: nil,
1100
+ content_filter: nil)
1066
1101
  kwargs = {
1067
1102
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1068
1103
  force_ocr: force_ocr, disable_ocr: disable_ocr, force_ocr_pages: force_ocr_pages,
@@ -1080,7 +1115,8 @@ module Kreuzberg
1080
1115
  extraction_timeout_secs: extraction_timeout_secs,
1081
1116
  max_archive_depth: max_archive_depth,
1082
1117
  acceleration: acceleration,
1083
- email: email
1118
+ email: email,
1119
+ content_filter: content_filter
1084
1120
  }
1085
1121
  extracted = extract_from_hash(hash, kwargs)
1086
1122
 
@@ -1115,6 +1151,7 @@ module Kreuzberg
1115
1151
  @concurrency = normalize_config(params[:concurrency], Concurrency)
1116
1152
  @acceleration = normalize_config(params[:acceleration], Acceleration)
1117
1153
  @email = normalize_config(params[:email], Email)
1154
+ @content_filter = normalize_config(params[:content_filter], ContentFilter)
1118
1155
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
1119
1156
  @max_archive_depth = params[:max_archive_depth]&.to_i || 3
1120
1157
  @output_format = validate_output_format(params[:output_format])
@@ -1175,7 +1212,8 @@ module Kreuzberg
1175
1212
  token_reduction: @token_reduction&.to_h, keywords: @keywords&.to_h,
1176
1213
  html_options: @html_options&.to_h, pages: @pages&.to_h,
1177
1214
  layout: @layout&.to_h, concurrency: @concurrency&.to_h,
1178
- acceleration: @acceleration&.to_h, email: @email&.to_h
1215
+ acceleration: @acceleration&.to_h, email: @email&.to_h,
1216
+ content_filter: @content_filter&.to_h
1179
1217
  }
1180
1218
  end
1181
1219
 
@@ -11,6 +11,7 @@ module Kreuzberg
11
11
  ERROR_CODE_PARSING = 5
12
12
  ERROR_CODE_OCR = 6
13
13
  ERROR_CODE_MISSING_DEPENDENCY = 7
14
+ ERROR_CODE_EMBEDDING = 8
14
15
 
15
16
  module Errors
16
17
  class PanicContext
@@ -112,5 +113,8 @@ module Kreuzberg
112
113
 
113
114
  # Raised when an unsupported file format or MIME type is encountered
114
115
  class UnsupportedFormatError < Error; end
116
+
117
+ # Raised when embedding fails
118
+ class EmbeddingError < Error; end
115
119
  end
116
120
  end
@@ -236,6 +236,41 @@ module Kreuzberg
236
236
  results
237
237
  end
238
238
 
239
+ # Asynchronously generate embeddings for multiple texts.
240
+ #
241
+ # Non-blocking embedding generation from a list of strings.
242
+ #
243
+ # @param texts [Array<String>] List of strings to embed.
244
+ # @param config [Config::Embedding, Hash, nil] Embedding configuration.
245
+ #
246
+ # @return [Array<Array<Float>>] Array of embedding vectors.
247
+ #
248
+ # @raise [Errors::EmbeddingError] If embedding generation fails.
249
+ #
250
+ # @example Generate embeddings asynchronously
251
+ # texts = ["Hello, world!", "Kreuzberg is awesome."]
252
+ # embeddings = Kreuzberg.embed(texts: texts)
253
+ # puts embeddings.first.length # 384
254
+ def embed(texts:, config: nil)
255
+ opts = normalize_config(config)
256
+ native_embed(texts: texts.map(&:to_s), config: opts)
257
+ end
258
+
259
+ # Synchronously generate embeddings for multiple texts.
260
+ #
261
+ # Blocking embedding generation from a list of strings.
262
+ #
263
+ # @param texts [Array<String>] List of strings to embed.
264
+ # @param config [Config::Embedding, Hash, nil] Embedding configuration.
265
+ #
266
+ # @return [Array<Array<Float>>] Array of embedding vectors.
267
+ #
268
+ # @raise [Errors::EmbeddingError] If embedding generation fails.
269
+ def embed_sync(texts:, config: nil)
270
+ opts = normalize_config(config)
271
+ native_embed_sync(texts: texts.map(&:to_s), config: opts)
272
+ end
273
+
239
274
  # Synchronously extract content from multiple byte data sources.
240
275
  #
241
276
  # Processes multiple in-memory binary documents in a single batch operation. Results
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.7.4'
4
+ VERSION = '4.8.0'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -59,10 +59,13 @@ module Kreuzberg
59
59
  alias native_batch_extract_bytes batch_extract_bytes
60
60
  alias native_clear_cache clear_cache
61
61
  alias native_cache_stats cache_stats
62
+ alias native_embed_sync embed_sync
63
+ alias native_embed embed
62
64
 
63
65
  private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
64
66
  private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
65
67
  private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
68
+ private :native_embed_sync, :native_embed
66
69
  end
67
70
 
68
71
  module_function :register_post_processor
@@ -94,6 +97,10 @@ module Kreuzberg
94
97
  module_function :validate_mime_type
95
98
 
96
99
  module_function :get_extensions_for_mime
100
+
101
+ module_function :embed_sync
102
+
103
+ module_function :embed
97
104
  end
98
105
 
99
106
  require_relative 'kreuzberg/cache_api'