kreuzberg 4.0.0.pre.rc.17 → 4.0.0.pre.rc.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +64 -2
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/ext/kreuzberg_rb/native/build.rs +6 -4
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/vendor/Cargo.toml +2 -2
  8. data/vendor/kreuzberg/Cargo.toml +1 -1
  9. data/vendor/kreuzberg-ffi/Cargo.toml +74 -0
  10. data/vendor/kreuzberg-ffi/README.md +851 -0
  11. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +230 -0
  12. data/vendor/kreuzberg-ffi/build.rs +176 -0
  13. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  14. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  15. data/vendor/kreuzberg-ffi/kreuzberg.h +2959 -0
  16. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +626 -0
  17. data/vendor/kreuzberg-ffi/src/config.rs +1050 -0
  18. data/vendor/kreuzberg-ffi/src/error.rs +950 -0
  19. data/vendor/kreuzberg-ffi/src/lib.rs +4107 -0
  20. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  21. data/vendor/kreuzberg-ffi/src/result.rs +517 -0
  22. data/vendor/kreuzberg-ffi/src/result_pool.rs +675 -0
  23. data/vendor/kreuzberg-ffi/src/result_view.rs +815 -0
  24. data/vendor/kreuzberg-ffi/src/string_intern.rs +596 -0
  25. data/vendor/kreuzberg-ffi/src/validation.rs +938 -0
  26. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  27. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  28. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  29. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  30. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  31. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  32. metadata +24 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f91977b1472bb6211f3ac2efad274e8cbc77dc5ed9832529eccbebeae1f74b4f
4
- data.tar.gz: b8a32377a80cfec656e8ddd65576dc220f497fc65b7b45b307db54a0b3b4a274
3
+ metadata.gz: a1c1d7ddcc45217bf5d9ea47a4d9d3ef9f41ed5a4bd87f4ff1f2ada7cfe0bca6
4
+ data.tar.gz: 167fb6c623c9e4368bcd2388e0ef4631d170d0e167cc0555c2dc7cd814bff9eb
5
5
  SHA512:
6
- metadata.gz: cb391d9f82848e0b19b0c8df2cce7db455d1b73ba5e5c6dd63a2cc87732d8dd0cd6596ca7f9b305061d9400db95c5890292efbd16af7e55a9434f3f29a337642
7
- data.tar.gz: 9e41afcc217e00d9feb3f8c4adecb7152743227f2c77f4bcfd9fd5e3d4b64b01171d3bdbb2b1290e10d2efdf13cdb53feb8fee01a95b6ba4d87ea76425b56692
6
+ metadata.gz: 37661576ba03012b1549c2388e0aee4c24cdb0e05cb34164fc85ca654c20236b28829678531c50fe46260433c43907c3f03cf516753f12c78ee2026a0b14a446
7
+ data.tar.gz: 96e1b3b10589fa7f47fc9609158da556e7d889a61963953c72e5f92337897828d68934d8dc0a66d7b2e28774204b6b1bf2c95226d5c26c84a98a01ce797e24df
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.17)
4
+ kreuzberg (4.0.0.pre.rc.18)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -455,6 +455,18 @@ dependencies = [
455
455
  "core2",
456
456
  ]
457
457
 
458
+ [[package]]
459
+ name = "bitvec"
460
+ version = "1.0.1"
461
+ source = "registry+https://github.com/rust-lang/crates.io-index"
462
+ checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
463
+ dependencies = [
464
+ "funty",
465
+ "radium",
466
+ "tap",
467
+ "wyz",
468
+ ]
469
+
458
470
  [[package]]
459
471
  name = "blake3"
460
472
  version = "1.8.2"
@@ -987,6 +999,20 @@ dependencies = [
987
999
  "serde",
988
1000
  ]
989
1001
 
1002
+ [[package]]
1003
+ name = "dashmap"
1004
+ version = "6.1.0"
1005
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1006
+ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
1007
+ dependencies = [
1008
+ "cfg-if",
1009
+ "crossbeam-utils",
1010
+ "hashbrown 0.14.5",
1011
+ "lock_api",
1012
+ "once_cell",
1013
+ "parking_lot_core",
1014
+ ]
1015
+
990
1016
  [[package]]
991
1017
  name = "debug_unsafe"
992
1018
  version = "0.1.3"
@@ -1434,6 +1460,12 @@ dependencies = [
1434
1460
  "windows-sys 0.59.0",
1435
1461
  ]
1436
1462
 
1463
+ [[package]]
1464
+ name = "funty"
1465
+ version = "2.0.0"
1466
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1467
+ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
1468
+
1437
1469
  [[package]]
1438
1470
  name = "futf"
1439
1471
  version = "0.1.5"
@@ -1638,6 +1670,12 @@ dependencies = [
1638
1670
  "zerocopy",
1639
1671
  ]
1640
1672
 
1673
+ [[package]]
1674
+ name = "hashbrown"
1675
+ version = "0.14.5"
1676
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1677
+ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
1678
+
1641
1679
  [[package]]
1642
1680
  name = "hashbrown"
1643
1681
  version = "0.15.5"
@@ -2280,7 +2318,7 @@ dependencies = [
2280
2318
 
2281
2319
  [[package]]
2282
2320
  name = "kreuzberg"
2283
- version = "4.0.0-rc.14"
2321
+ version = "4.0.0-rc.17"
2284
2322
  dependencies = [
2285
2323
  "ahash",
2286
2324
  "async-trait",
@@ -2288,8 +2326,10 @@ dependencies = [
2288
2326
  "base64 0.22.1",
2289
2327
  "base64-simd",
2290
2328
  "biblatex",
2329
+ "bitvec",
2291
2330
  "calamine",
2292
2331
  "chardetng",
2332
+ "dashmap",
2293
2333
  "docx-lite",
2294
2334
  "encoding_rs",
2295
2335
  "fast_image_resize",
@@ -2334,6 +2374,7 @@ dependencies = [
2334
2374
  "serde_json",
2335
2375
  "serde_yaml_ng",
2336
2376
  "sevenz-rust2",
2377
+ "simdutf8",
2337
2378
  "tar",
2338
2379
  "text-splitter",
2339
2380
  "thiserror 2.0.17",
@@ -2368,7 +2409,7 @@ dependencies = [
2368
2409
 
2369
2410
  [[package]]
2370
2411
  name = "kreuzberg-tesseract"
2371
- version = "4.0.0-rc.14"
2412
+ version = "4.0.0-rc.17"
2372
2413
  dependencies = [
2373
2414
  "cc",
2374
2415
  "cmake",
@@ -4124,6 +4165,12 @@ version = "5.3.0"
4124
4165
  source = "registry+https://github.com/rust-lang/crates.io-index"
4125
4166
  checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
4126
4167
 
4168
+ [[package]]
4169
+ name = "radium"
4170
+ version = "0.7.0"
4171
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4172
+ checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
4173
+
4127
4174
  [[package]]
4128
4175
  name = "rake"
4129
4176
  version = "0.3.6"
@@ -5218,6 +5265,12 @@ dependencies = [
5218
5265
  "syn",
5219
5266
  ]
5220
5267
 
5268
+ [[package]]
5269
+ name = "tap"
5270
+ version = "1.0.1"
5271
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5272
+ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
5273
+
5221
5274
  [[package]]
5222
5275
  name = "tar"
5223
5276
  version = "0.4.44"
@@ -6439,6 +6492,15 @@ version = "0.6.2"
6439
6492
  source = "registry+https://github.com/rust-lang/crates.io-index"
6440
6493
  checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
6441
6494
 
6495
+ [[package]]
6496
+ name = "wyz"
6497
+ version = "0.5.1"
6498
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6499
+ checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
6500
+ dependencies = [
6501
+ "tap",
6502
+ ]
6503
+
6442
6504
  [[package]]
6443
6505
  name = "xattr"
6444
6506
  version = "1.6.1"
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.17"
10
+ version = "4.0.0-rc.18"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -54,15 +54,17 @@ fn main() {
54
54
 
55
55
  // Fallback: Add search paths and use standard linking
56
56
  for dir in [host_lib_dir, target_lib_dir] {
57
- if dir.exists() {
58
- println!("cargo:rustc-link-search=native={}", dir.display());
59
- }
57
+ println!("cargo:rustc-link-search=native={}", dir.display());
60
58
  }
61
59
  }
62
60
 
63
61
  // Link the kreuzberg-ffi library
64
62
  // When kreuzberg-ffi is built, its symbols become available for linking
65
- println!("cargo:rustc-link-lib=static=kreuzberg_ffi");
63
+ if target.contains("windows") {
64
+ println!("cargo:rustc-link-lib=dylib=kreuzberg_ffi");
65
+ } else {
66
+ println!("cargo:rustc-link-lib=static=kreuzberg_ffi");
67
+ }
66
68
 
67
69
  if target.contains("darwin") {
68
70
  println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.17'
4
+ VERSION = '4.0.0-rc.18'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -1,8 +1,8 @@
1
1
  [workspace]
2
- members = ["kreuzberg", "kreuzberg-tesseract"]
2
+ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.0.0-rc.17"
5
+ version = "4.0.0-rc.18"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.0.0-rc.17"
3
+ version = "4.0.0-rc.18"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -0,0 +1,74 @@
1
+ [package]
2
+ name = "kreuzberg-ffi"
3
+ version.workspace = true
4
+ edition.workspace = true
5
+ rust-version.workspace = true
6
+ authors.workspace = true
7
+ description = "C FFI bindings for Kreuzberg document intelligence library"
8
+ license.workspace = true
9
+ repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
+ homepage = "https://kreuzberg.dev"
11
+ documentation = "https://docs.rs/kreuzberg-ffi"
12
+ readme = "README.md"
13
+ keywords = ["ffi", "bindings", "document", "extraction", "api"]
14
+ categories = ["development-tools::ffi", "text-processing"]
15
+
16
+ [lib]
17
+ # cdylib: Required by Java (FFM API) and Go (cgo dynamic linking)
18
+ # staticlib: Required by Python (PyO3 static linking) to avoid dylib install_name issues on macOS
19
+ # rlib: Standard Rust library format for workspace dependencies
20
+ crate-type = ["cdylib", "staticlib", "rlib"]
21
+
22
+ [features]
23
+ # Mirror embeddings feature availability from kreuzberg dependency
24
+ embeddings = []
25
+ # Optional rayon for parallel batch processing
26
+ rayon = ["dep:rayon"]
27
+ # Re-export kreuzberg features for downstream conditional compilation
28
+ pdf = []
29
+ keywords-yake = []
30
+ keywords-rake = []
31
+
32
+ [dependencies]
33
+ serde_json = { workspace = true }
34
+ serde = { workspace = true }
35
+ async-trait = { workspace = true }
36
+ tokio = { workspace = true }
37
+ html-to-markdown-rs = { version = "2.16.1", default-features = false }
38
+ rayon = { version = "1.11", optional = true }
39
+
40
+ # On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
41
+ # in MinGW-compatible form. Use all other features but exclude embeddings.
42
+ [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
43
+ kreuzberg = { path = "../kreuzberg", features = [
44
+ "pdf",
45
+ "excel",
46
+ "office",
47
+ "email",
48
+ "html",
49
+ "xml",
50
+ "archives",
51
+ "ocr",
52
+ "language-detection",
53
+ "chunking",
54
+ "quality",
55
+ "keywords",
56
+ "api",
57
+ "mcp",
58
+ "otel",
59
+ "bundled-pdfium",
60
+ ] }
61
+
62
+ [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
63
+ kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
64
+
65
+ [build-dependencies]
66
+ cbindgen = "0.29"
67
+
68
+ [dev-dependencies]
69
+ tempfile = { workspace = true }
70
+ criterion = { version = "0.8", features = ["html_reports"] }
71
+
72
+ [[bench]]
73
+ name = "result_view_benchmark"
74
+ harness = false