kreuzberg 4.8.3 → 4.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b636a0d3207a655747e4b288b0f66b616b460b282f49d2b4947ec23ac319cfc
4
- data.tar.gz: ef5522acd061db0093d68afb96d440888f605aefaa6e0cc262f248e2e82e62cb
3
+ metadata.gz: 16deeaa47cb35ded0b844af72d43b74da5539084f21a79d17513be2da9ac2f0b
4
+ data.tar.gz: 64715b14cffac78a796853e9f5d9a2d0969427de9d59a243c87a5d20699dcce3
5
5
  SHA512:
6
- metadata.gz: 89b72104facd4e87ecc5c65011305abdcd77fd829073dcd235829d67297ad17b09136bb9c2df0dd3dcf7e79ba2f86bba6ceb69fdbf22b783725555b4efa375b3
7
- data.tar.gz: c82c2c92b595613497385af930f497762d56831818cda45d230044a61fbbe5730da4d7781cb956fa6096b53e364dfff9772d3517fe2cc879785d53659a45d5d0
6
+ metadata.gz: e362717e5db0fad6a9494737e53c2444a84cb76fd274c70283a6650eef0891e9ced2af424b2ed9501eb749f21fcfb2ca3b4f8c7b336d1a248bb99f4a7e69131e
7
+ data.tar.gz: 5d05d862a170f0efe0f6f6a9867846bb3b000136f638b1efe6ddee5e94310dc92495a40a7dff204b4098f92989175e1691ba10897be884ca960477d48dcbc6ca
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.3" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.4" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1601,12 +1601,9 @@ dependencies = [
1601
1601
 
1602
1602
  [[package]]
1603
1603
  name = "fearless_simd"
1604
- version = "0.3.0"
1604
+ version = "0.4.0"
1605
1605
  source = "registry+https://github.com/rust-lang/crates.io-index"
1606
- checksum = "8fb2907d1f08b2b316b9223ced5b0e89d87028ba8deae9764741dba8ff7f3903"
1607
- dependencies = [
1608
- "bytemuck",
1609
- ]
1606
+ checksum = "76258897e51fd156ee03b6246ea53f3e0eb395d0b327e9961c4fc4c8b2fa151a"
1610
1607
 
1611
1608
  [[package]]
1612
1609
  name = "filetime"
@@ -2065,9 +2062,9 @@ dependencies = [
2065
2062
 
2066
2063
  [[package]]
2067
2064
  name = "hayro-jpeg2000"
2068
- version = "0.3.4"
2065
+ version = "0.3.5"
2069
2066
  source = "registry+https://github.com/rust-lang/crates.io-index"
2070
- checksum = "c1a74cfc18c0093ef8009a0d6c1ba3024df0cce228503a14c1372e1e23eed43e"
2067
+ checksum = "c75ab947623ef4ccaa7acf0579edf7cbb5a73838e3839a7be73335e522f433a1"
2071
2068
  dependencies = [
2072
2069
  "fearless_simd",
2073
2070
  ]
@@ -2251,15 +2248,14 @@ dependencies = [
2251
2248
 
2252
2249
  [[package]]
2253
2250
  name = "hyper-rustls"
2254
- version = "0.27.7"
2251
+ version = "0.27.8"
2255
2252
  source = "registry+https://github.com/rust-lang/crates.io-index"
2256
- checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
2253
+ checksum = "c2b52f86d1d4bc0d6b4e6826d960b1b333217e07d36b882dca570a5e1c48895b"
2257
2254
  dependencies = [
2258
2255
  "http",
2259
2256
  "hyper",
2260
2257
  "hyper-util",
2261
2258
  "rustls",
2262
- "rustls-pki-types",
2263
2259
  "tokio",
2264
2260
  "tokio-rustls",
2265
2261
  "tower-service",
@@ -2935,7 +2931,7 @@ dependencies = [
2935
2931
 
2936
2932
  [[package]]
2937
2933
  name = "kreuzberg-rb"
2938
- version = "4.8.3"
2934
+ version = "4.8.4"
2939
2935
  dependencies = [
2940
2936
  "async-trait",
2941
2937
  "html-to-markdown-rs",
@@ -2987,9 +2983,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
2987
2983
 
2988
2984
  [[package]]
2989
2985
  name = "libc"
2990
- version = "0.2.184"
2986
+ version = "0.2.185"
2991
2987
  source = "registry+https://github.com/rust-lang/crates.io-index"
2992
- checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af"
2988
+ checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
2993
2989
 
2994
2990
  [[package]]
2995
2991
  name = "libfuzzer-sys"
@@ -3644,9 +3640,9 @@ dependencies = [
3644
3640
 
3645
3641
  [[package]]
3646
3642
  name = "openssl"
3647
- version = "0.10.76"
3643
+ version = "0.10.77"
3648
3644
  source = "registry+https://github.com/rust-lang/crates.io-index"
3649
- checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
3645
+ checksum = "bfe4646e360ec77dff7dde40ed3d6c5fee52d156ef4a62f53973d38294dad87f"
3650
3646
  dependencies = [
3651
3647
  "bitflags",
3652
3648
  "cfg-if",
@@ -3676,9 +3672,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
3676
3672
 
3677
3673
  [[package]]
3678
3674
  name = "openssl-sys"
3679
- version = "0.9.112"
3675
+ version = "0.9.113"
3680
3676
  source = "registry+https://github.com/rust-lang/crates.io-index"
3681
- checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
3677
+ checksum = "ad2f2c0eba47118757e4c6d2bff2838f3e0523380021356e7875e858372ce644"
3682
3678
  dependencies = [
3683
3679
  "cc",
3684
3680
  "libc",
@@ -3873,9 +3869,9 @@ checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
3873
3869
 
3874
3870
  [[package]]
3875
3871
  name = "pkg-config"
3876
- version = "0.3.32"
3872
+ version = "0.3.33"
3877
3873
  source = "registry+https://github.com/rust-lang/crates.io-index"
3878
- checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
3874
+ checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
3879
3875
 
3880
3876
  [[package]]
3881
3877
  name = "plain"
@@ -4564,9 +4560,9 @@ dependencies = [
4564
4560
 
4565
4561
  [[package]]
4566
4562
  name = "rustls"
4567
- version = "0.23.37"
4563
+ version = "0.23.38"
4568
4564
  source = "registry+https://github.com/rust-lang/crates.io-index"
4569
- checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
4565
+ checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21"
4570
4566
  dependencies = [
4571
4567
  "aws-lc-rs",
4572
4568
  "log",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.3'
4
+ VERSION = '4.8.4'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.8.3"
5
+ version = "4.8.4"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -23,7 +23,7 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
23
23
  comrak = { version = "0.52", default-features = false }
24
24
  console_error_panic_hook = "0.1"
25
25
  criterion = { version = "0.8", features = ["html_reports"] }
26
- ctor = "0.9"
26
+ ctor = "0.10"
27
27
  dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
@@ -32,10 +32,10 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.8.3", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.3" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.8.4", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.4" }
37
37
  lazy_static = "1.5.0"
38
- libc = "0.2.184"
38
+ libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
40
40
  log = "0.4"
41
41
  lzma-rust2 = { version = "0.16.2" }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.24", default-features = false }
48
+ pdf_oxide = { version = "0.3.30", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.11.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -265,7 +265,7 @@ dirs = "6"
265
265
  encoding_rs = { version = "0.8.35" }
266
266
  fast_image_resize = { version = "6.0.0", optional = true }
267
267
  flate2 = { version = "1.1", optional = true }
268
- hayro-jbig2 = { version = "0.2", default-features = false, features = ["std"], optional = true }
268
+ hayro-jbig2 = { version = "0.3", default-features = false, features = ["std"], optional = true }
269
269
  hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
270
270
  "std",
271
271
  "simd",
@@ -291,7 +291,7 @@ jotdown = "0.9"
291
291
  kamadak-exif = { version = "0.6.1", optional = true }
292
292
 
293
293
  kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
294
- libc = "0.2.184"
294
+ libc = "0.2.185"
295
295
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false, optional = true }
296
296
  log = "0.4"
297
297
  lopdf = { version = "0.40.0", optional = true }
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.24", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.30", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.8.3 Release**
21
+ > **🚀 Version 4.8.4 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -252,8 +252,11 @@ fn build_inlines<'a>(
252
252
  let mut pos: u32 = 0;
253
253
 
254
254
  for ann in &sorted {
255
- let start = ann.start.min(len);
256
- let end = ann.end.min(len);
255
+ // Clamp to text length, then snap to valid UTF-8 char boundaries.
256
+ // Annotation byte offsets can land inside multi-byte characters
257
+ // (e.g. Cyrillic «»), which would panic on slice indexing.
258
+ let start = text.ceil_char_boundary(ann.start.min(len) as usize) as u32;
259
+ let end = text.floor_char_boundary(ann.end.min(len) as usize) as u32;
257
260
 
258
261
  // Skip overlapping annotations.
259
262
  if start < pos {
@@ -266,7 +269,13 @@ fn build_inlines<'a>(
266
269
  continue;
267
270
  }
268
271
 
272
+ // Skip degenerate annotations where boundary snapping collapsed the range.
273
+ if start >= end {
274
+ continue;
275
+ }
276
+
269
277
  // Gap text before this annotation.
278
+ // `pos` is always on a char boundary (starts at 0, updated to floor-snapped `end`).
270
279
  if start > pos {
271
280
  let gap = &text[pos as usize..start as usize];
272
281
  if !gap.is_empty() {
@@ -1162,6 +1171,41 @@ mod tests {
1162
1171
  assert!(out.contains("*Hello World*"), "got: {}", out);
1163
1172
  }
1164
1173
 
1174
+ #[test]
1175
+ fn test_annotation_on_multibyte_char_boundary() {
1176
+ // Regression: annotation byte offsets that land inside a multi-byte
1177
+ // UTF-8 character (e.g. Cyrillic «») must not panic.
1178
+ // «ярко»: each char is 2 bytes → « 0..2, я 2..4, р 4..6, к 6..8, о 8..10, » 10..12.
1179
+ // Annotation starts at byte 1 (inside «) and ends at byte 11 (inside »).
1180
+ let mut b = InternalDocumentBuilder::new("test");
1181
+ let ann = vec![TextAnnotation {
1182
+ start: 1,
1183
+ end: 11,
1184
+ kind: AnnotationKind::Bold,
1185
+ }];
1186
+ b.push_paragraph("«ярко»", ann, None, None);
1187
+ let doc = b.build();
1188
+ let out = render(&doc);
1189
+ assert!(out.contains("ярко"), "Cyrillic content should be present, got: {}", out);
1190
+ }
1191
+
1192
+ #[test]
1193
+ fn test_annotation_on_valid_multibyte_boundaries() {
1194
+ // Annotations on correct char boundaries must still produce formatting.
1195
+ let mut b = InternalDocumentBuilder::new("test");
1196
+ // "Привет" = 12 bytes, " " = 1, "мир" = 6 → 19 bytes total.
1197
+ let ann = vec![TextAnnotation {
1198
+ start: 0,
1199
+ end: 12,
1200
+ kind: AnnotationKind::Bold,
1201
+ }];
1202
+ b.push_paragraph("Привет мир", ann, None, None);
1203
+ let doc = b.build();
1204
+ let out = render(&doc);
1205
+ assert!(out.contains("**Привет**"), "got: {}", out);
1206
+ assert!(out.contains("мир"), "got: {}", out);
1207
+ }
1208
+
1165
1209
  #[test]
1166
1210
  fn test_footnote() {
1167
1211
  let mut b = InternalDocumentBuilder::new("test");
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -26,40 +26,15 @@ rayon = ["dep:rayon"]
26
26
  [dependencies]
27
27
  ahash = { version = "0.8.12", features = ["serde"] }
28
28
  async-trait = "0.1.89"
29
- ctor = "0.9"
29
+ ctor = "0.10"
30
30
  html-to-markdown-rs = { version = "3.1.0", default-features = false }
31
+ kreuzberg = { path = "../kreuzberg", version = "4.8.4", default-features = false, features = ["bundled-pdfium", "full"] }
31
32
  log = "0.4"
32
33
  rayon = { version = "1.11.0", optional = true }
33
34
  serde = { version = "1.0.228", features = ["derive"] }
34
35
  serde_json = { version = "1.0.149" }
35
36
  tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
36
37
 
37
- [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
38
- kreuzberg = { path = "../kreuzberg", version = "4.8.3", default-features = false, features = [
39
- "pdf",
40
- "excel",
41
- "office",
42
- "email",
43
- "html",
44
- "xml",
45
- "archives",
46
- "ocr",
47
- "language-detection",
48
- "chunking",
49
- "chunking-tokenizers",
50
- "embeddings",
51
- "quality",
52
- "keywords",
53
- "api",
54
- "mcp",
55
- "otel",
56
- "tree-sitter",
57
- "bundled-pdfium",
58
- ] }
59
-
60
- [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
61
- kreuzberg = { path = "../kreuzberg", version = "4.8.3", default-features = false, features = ["bundled-pdfium", "full"] }
62
-
63
38
  [build-dependencies]
64
39
  cbindgen = "0.29"
65
40
 
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 8
12
- #define KREUZBERG_VERSION_PATCH 3
13
- #define KREUZBERG_VERSION "4.8.3"
12
+ #define KREUZBERG_VERSION_PATCH 4
13
+ #define KREUZBERG_VERSION "4.8.4"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.8.3"
3
+ version = "4.8.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.8.3
4
+ version: 4.8.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-12 00:00:00.000000000 Z
11
+ date: 2026-04-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys