kreuzberg 4.9.7 → 4.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24151d9931038d170843ec2b42b0f34b65d78a6f07418683ba71a96c7b2f4f50
4
- data.tar.gz: 701aedb58613376393f8e168dbd2ecea211063172cbfdaae7af96590d990e8c1
3
+ metadata.gz: b634075425816167cdb132080aa31a6c5c561badbd154de17896866bdf88ba6e
4
+ data.tar.gz: 4a8d041bad2da842a676b2e2358ba44d3269323e820728bdb568ebc9171adee1
5
5
  SHA512:
6
- metadata.gz: b8917f84b8d8f7861c2f71a41248244344da6444942267e65b7feb04123017f099d618b6d72632d710936983f8ee9771e6847447e2119363d6e123caf8cc4f60
7
- data.tar.gz: 59fb7bd5a2c079fde4221aa1d578542cdf1b5f3637b28787d735d7a87b6fc86dd0f239e0623d4284c39d1175bed1072f4291a644c435d7242db9e5a3abeb5575
6
+ metadata.gz: 60c018a882054c23b629ee1d0692493c25c78a52e106fc7f25e3c22d7cdcfd36341f5aaeebea3d6be48d1425550914124daaeaeeb02bbd84bbb64cd6afedad59
7
+ data.tar.gz: 62d942f23dae32120184cd5b011275a2556b3e112a866128bd290a67ddbbd94c523ec2eaecd24589911264609ae612fd22eb51f3efc2e9eec4b1d08f8bd6823c
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.7" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.8" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -49,13 +49,13 @@ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
49
49
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
50
50
  "rb-sys",
51
51
  ] }
52
- rb-sys = { version = "0.9.126", default-features = false, features = [
52
+ rb-sys = { version = "0.9.128", default-features = false, features = [
53
53
  "stable-api-compiled-fallback",
54
54
  ] }
55
55
  serde_json = "1.0.149"
56
56
  toml = "1.1.2"
57
57
  serde_yaml_ng = "0.10"
58
- tokio = { version = "1.52.1", features = [
58
+ tokio = { version = "1.52.3", features = [
59
59
  "rt",
60
60
  "rt-multi-thread",
61
61
  "macros",
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
65
65
  "time",
66
66
  "io-util",
67
67
  ] }
68
- html-to-markdown-rs = { version = "3.3.1", default-features = false }
68
+ html-to-markdown-rs = { version = "3.4.1", default-features = false }
69
69
 
70
70
  [dev-dependencies]
71
71
  pretty_assertions = "1.4"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.7'
4
+ VERSION = '4.9.8'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.7"
5
+ version = "4.9.8"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -23,20 +23,20 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
23
23
  comrak = { version = "0.52", default-features = false }
24
24
  console_error_panic_hook = "0.1"
25
25
  criterion = { version = "0.8", features = ["html_reports"] }
26
- ctor = "0.10"
26
+ ctor = "1.0"
27
27
  dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
30
30
  hex = "0.4.3"
31
- html-to-markdown-rs = { version = "3.3.1", default-features = false }
31
+ html-to-markdown-rs = { version = "3.4.1", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.7", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.7" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.8", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.8" }
37
37
  lazy_static = "1.5.0"
38
- libc = "0.2.185"
39
- liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
38
+ libc = "0.2.186"
39
+ liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false }
40
40
  log = "0.4"
41
41
  lzma-rust2 = { version = "0.16.2" }
42
42
  memmap2 = "0.9"
@@ -45,19 +45,19 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.37", default-features = false }
48
+ pdf_oxide = { version = "0.3.49", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.12.0"
51
- reqwest = { version = "0.13.2", default-features = false }
51
+ reqwest = { version = "0.13.3", default-features = false }
52
52
  serde = { version = "1.0.228", features = ["derive"] }
53
53
  serde_json = { version = "1.0.149" }
54
54
  serde_toon_format = "0.1"
55
55
  tempfile = "3.27.0"
56
56
  thiserror = "2.0.18"
57
- tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
57
+ tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
58
58
  toml = "1.1.2"
59
59
  tracing = "0.1"
60
- tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
60
+ tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false }
61
61
  wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
62
62
  wasm-bindgen-futures = "0.4"
63
63
  web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -254,7 +254,7 @@ biblib = { version = "0.4", default-features = false, features = [
254
254
  bitvec = "1.0"
255
255
  blake3 = "1"
256
256
  bytes = { version = "1", features = ["serde"] }
257
- calamine = { version = "0.34.0", features = ["dates"], optional = true }
257
+ calamine = { version = "0.35.0", features = ["dates"], optional = true }
258
258
  cfb = { version = "0.14", optional = true }
259
259
  chardetng = { version = "1.0.0", optional = true }
260
260
  chrono = { version = "0.4", optional = true }
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
271
271
  "simd",
272
272
  ], optional = true }
273
273
  hex = "0.4.3"
274
- html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
274
+ html-to-markdown-rs = { version = "3.4.1", default-features = false, features = [
275
275
  "inline-images",
276
276
  "metadata",
277
277
  ], optional = true }
@@ -291,11 +291,11 @@ jotdown = "0.10"
291
291
  kamadak-exif = { version = "0.6.1", optional = true }
292
292
 
293
293
  kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
294
- libc = "0.2.185"
295
- liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false, optional = true }
294
+ libc = "0.2.186"
295
+ liter-llm = { version = "1.3", features = ["native-http", "tracing"], default-features = false, optional = true }
296
296
  log = "0.4"
297
297
  lopdf = { version = "0.40.0", optional = true }
298
- mail-parser = { version = "0.11.2", optional = true }
298
+ mail-parser = { version = "0.11.3", optional = true }
299
299
  memchr = "2.8.0"
300
300
  memmap2 = "0.9"
301
301
  mime_guess = "2.0"
@@ -303,8 +303,8 @@ minijinja = { version = "2", optional = true }
303
303
  ndarray = { version = "0.17", optional = true }
304
304
  num_cpus = "1.17.0"
305
305
  once_cell = "1.21.4"
306
- opentelemetry = { version = "0.31", features = ["trace"], optional = true }
307
- opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"], optional = true }
306
+ opentelemetry = { version = "0.32", features = ["trace"], optional = true }
307
+ opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
308
308
  org = { version = "0.3", optional = true }
309
309
  ort = { version = "2.0.0-rc.12", default-features = false, features = [
310
310
  "std",
@@ -314,14 +314,14 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.37", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.49", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
- quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
320
+ quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
321
321
  rake = { version = "0.3.6", optional = true }
322
322
  rayon = "1.12.0"
323
323
  regex = "1.12.3"
324
- rmcp = { version = "1.5.0", features = [
324
+ rmcp = { version = "1.7.0", features = [
325
325
  "server",
326
326
  "macros",
327
327
  "base64",
@@ -345,11 +345,14 @@ tempfile = { version = "3.27.0", optional = true }
345
345
  text-splitter = { version = "0.30.1", features = ["markdown"], optional = true }
346
346
  thiserror = "2.0.18"
347
347
  tiff = { version = "0.11", optional = true }
348
- tokenizers = { version = "0.22", optional = true, default-features = false, features = [
348
+ # Pinned to 0.22: text-splitter 0.30.1 ChunkSizer impl + embeddings/add_special_tokens
349
+ # break against tokenizers 0.23. Bump deliberately on the next minor with a coordinated
350
+ # text-splitter upgrade. Tracked under issue #991 / 4.9.8 release.
351
+ tokenizers = { version = "=0.22.2", optional = true, default-features = false, features = [
349
352
  "http",
350
353
  "fancy-regex",
351
354
  ] }
352
- tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
355
+ tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"], optional = true }
353
356
  toml = "1.1.2"
354
357
  tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
355
358
  tower-http = { version = "0.6", features = ["cors", "trace", "limit", "catch-panic", "request-id", "sensitive-headers", "compression-full"], optional = true }
@@ -357,8 +360,10 @@ tracing = "0.1"
357
360
  tracing-opentelemetry = { version = "0.32", optional = true }
358
361
  unicode-normalization = { version = "0.1.25", optional = true }
359
362
  urlencoding = "2"
360
- utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
361
- v_htmlescape = { version = "0.15", optional = true }
363
+ utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
364
+ # Pinned to 0.15: v_htmlescape 0.17 renamed `escape` fn to an `Escape` struct.
365
+ # Update call sites in src/rendering/html_styled.rs before bumping.
366
+ v_htmlescape = { version = "=0.15.8", optional = true }
362
367
  whatlang = { version = "0.18.0", optional = true }
363
368
  zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
364
369
  "deflate-flate2",
@@ -392,7 +397,7 @@ optional = true
392
397
  # Override getrandom to enable js feature for WASM targets
393
398
  # This is needed because ring/rustls (via ureq) depend on getrandom without js feature
394
399
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
395
- tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
400
+ tree-sitter-language-pack = { version = "1.8.1", features = ["serde"], default-features = false, optional = true }
396
401
  wasm-bindgen-rayon = { version = "1.3", optional = true }
397
402
 
398
403
  [build-dependencies]
@@ -409,7 +414,7 @@ jsonschema = "0.46"
409
414
  serial_test = "3.4.0"
410
415
  tar = "0.4.45"
411
416
  tempfile = "3.27.0"
412
- tokio = { version = "1.52.1", features = ["macros", "time"] }
417
+ tokio = { version = "1.52.3", features = ["macros", "time"] }
413
418
  tokio-test = "0.4"
414
419
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
415
420
  zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.7 Release**
21
+ > **🚀 Version 4.9.8 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -155,7 +155,6 @@ impl From<&TreeSitterProcessConfig> for tree_sitter_language_pack::ProcessConfig
155
155
  symbols: p.symbols,
156
156
  diagnostics: p.diagnostics,
157
157
  chunk_max_size: p.chunk_max_size,
158
- extractions: None,
159
158
  }
160
159
  }
161
160
  }
@@ -1,6 +1,8 @@
1
1
  //! Character encoding utilities for RTF parsing.
2
2
  //!
3
- //! Provides hex byte parsing and Windows-1252 character mapping for the 0x80-0x9F range.
3
+ //! Provides hex byte parsing and legacy Windows codepage decoding for RTF byte escapes.
4
+
5
+ use encoding_rs::Encoding;
4
6
 
5
7
  /// Convert a hex digit character to its numeric value.
6
8
  ///
@@ -69,6 +71,48 @@ pub fn decode_windows_1252(byte: u8) -> char {
69
71
  }
70
72
  }
71
73
 
74
+ /// Map a Windows codepage number to an `encoding_rs` encoding.
75
+ ///
76
+ /// Unknown values fall back to Windows-1252, the RTF default ANSI codepage.
77
+ #[inline]
78
+ pub(crate) fn encoding_for_windows_codepage(codepage: u32) -> &'static Encoding {
79
+ let label: &[u8] = match codepage {
80
+ 65001 => b"utf-8",
81
+ 20127 => b"us-ascii",
82
+ 1250 => b"windows-1250",
83
+ 1251 => b"windows-1251",
84
+ 1252 => b"windows-1252",
85
+ 1253 => b"windows-1253",
86
+ 1254 => b"windows-1254",
87
+ 1255 => b"windows-1255",
88
+ 1256 => b"windows-1256",
89
+ 1257 => b"windows-1257",
90
+ 1258 => b"windows-1258",
91
+ 932 | 10001 => b"shift_jis",
92
+ 936 | 10008 => b"gbk",
93
+ 949 | 10003 => b"euc-kr",
94
+ 950 | 10002 => b"big5",
95
+ 28591 => b"iso-8859-1",
96
+ 28592 => b"iso-8859-2",
97
+ 28595 => b"iso-8859-5",
98
+ 28597 => b"iso-8859-7",
99
+ 28599 => b"iso-8859-9",
100
+ _ => b"windows-1252",
101
+ };
102
+ Encoding::for_label(label).unwrap_or(encoding_rs::WINDOWS_1252)
103
+ }
104
+
105
+ /// Decode RTF hex escape bytes using the active ANSI codepage.
106
+ #[inline]
107
+ pub(crate) fn decode_ansi_bytes(bytes: &[u8], codepage: u32) -> String {
108
+ if codepage == 1252 {
109
+ return bytes.iter().map(|&byte| decode_windows_1252(byte)).collect();
110
+ }
111
+
112
+ let (decoded, _, _) = encoding_for_windows_codepage(codepage).decode(bytes);
113
+ decoded.into_owned()
114
+ }
115
+
72
116
  /// Parse an RTF control word and extract its value.
73
117
  ///
74
118
  /// Returns a tuple of (control_word, optional_numeric_value).
@@ -486,6 +486,19 @@ mod tests {
486
486
  );
487
487
  }
488
488
 
489
+ #[test]
490
+ fn test_rtf_ansicpg1251_hex_escape_extraction() {
491
+ let rtf_content =
492
+ r#"{\rtf1\ansi\ansicpg1251\deff0{\fonttbl{\f0\fnil\fcharset204 Arial;}}\f0 \'cf\'f0\'e8\'e2\'e5\'f2}"#;
493
+ let (text, _, _, _, _) = extract_text_from_rtf(rtf_content, true);
494
+
495
+ assert!(text.contains("Привет"), "expected readable Cyrillic, got: {text:?}");
496
+ assert!(
497
+ !text.contains("Ïðèâåò"),
498
+ "should not decode CP1251 bytes as Windows-1252"
499
+ );
500
+ }
501
+
489
502
  #[tokio::test]
490
503
  async fn test_rtf_document_structure_with_annotations() {
491
504
  let rtf_content = r#"{\rtf1 Normal text\par {\b Bold paragraph}\par More normal text}"#;
@@ -1,6 +1,6 @@
1
1
  //! Core RTF parsing logic.
2
2
 
3
- use crate::extractors::rtf::encoding::{decode_windows_1252, parse_hex_byte, parse_rtf_control_word};
3
+ use crate::extractors::rtf::encoding::{decode_ansi_bytes, parse_hex_byte, parse_rtf_control_word};
4
4
  use crate::extractors::rtf::formatting::{map_offset, normalize_whitespace_with_mapping};
5
5
  use crate::extractors::rtf::images::{RtfImage, extract_pict_image};
6
6
  use crate::extractors::rtf::tables::TableState;
@@ -944,6 +944,9 @@ pub fn extract_text_from_rtf(
944
944
  // Unicode skip count (\ucN): how many replacement bytes follow \uN.
945
945
  // Scoped per group — push on '{', pop on '}'.
946
946
  let mut uc_stack: Vec<u8> = vec![1]; // default \uc1
947
+ // ANSI codepage for \'hh escapes. RTF defaults to Windows-1252 unless
948
+ // overridden by \ansicpgNNNN. Scoped like other document properties.
949
+ let mut ansi_codepage_stack: Vec<u32> = vec![1252];
947
950
 
948
951
  // Hyperlink field tracking for \field{\*\fldinst HYPERLINK "url"}{\fldrslt text}
949
952
  let mut in_fldinst = false;
@@ -1009,6 +1012,8 @@ pub fn extract_text_from_rtf(
1009
1012
  // Inherit current uc value into new group scope
1010
1013
  let current_uc = uc_stack.last().copied().unwrap_or(1);
1011
1014
  uc_stack.push(current_uc);
1015
+ let current_codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
1016
+ ansi_codepage_stack.push(current_codepage);
1012
1017
  // Inherit hidden state into new group scope
1013
1018
  let current_hidden = hidden_stack.last().copied().unwrap_or(false);
1014
1019
  hidden_stack.push(current_hidden);
@@ -1028,6 +1033,9 @@ pub fn extract_text_from_rtf(
1028
1033
  if uc_stack.len() > 1 {
1029
1034
  uc_stack.pop();
1030
1035
  }
1036
+ if ansi_codepage_stack.len() > 1 {
1037
+ ansi_codepage_stack.pop();
1038
+ }
1031
1039
  if hidden_stack.len() > 1 {
1032
1040
  hidden_stack.pop();
1033
1041
  }
@@ -1143,6 +1151,7 @@ pub fn extract_text_from_rtf(
1143
1151
  &mut para_metas,
1144
1152
  &mut para_meta_emitted,
1145
1153
  &mut uc_stack,
1154
+ &mut ansi_codepage_stack,
1146
1155
  &mut footnote_count,
1147
1156
  in_footnote,
1148
1157
  &mut footnote_buf,
@@ -1188,12 +1197,21 @@ pub fn extract_text_from_rtf(
1188
1197
  expect_destination = false;
1189
1198
  let hex1 = chars.next();
1190
1199
  let hex2 = chars.next();
1191
- // Capture hex-encoded chars in footnote buffer even when skipping
1192
- if in_footnote
1193
- && let (Some(h1), Some(h2)) = (hex1, hex2)
1200
+ let bytes = if let (Some(h1), Some(h2)) = (hex1, hex2)
1194
1201
  && let Some(byte) = parse_hex_byte(h1, h2)
1195
1202
  {
1196
- footnote_buf.push(decode_windows_1252(byte));
1203
+ let mut bytes = vec![byte];
1204
+ while let Some(next_bytes) = consume_adjacent_hex_escape(&mut chars) {
1205
+ bytes.push(next_bytes);
1206
+ }
1207
+ Some(bytes)
1208
+ } else {
1209
+ None
1210
+ };
1211
+
1212
+ if in_footnote && let Some(bytes) = bytes.as_deref() {
1213
+ let codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
1214
+ footnote_buf.push_str(&decode_ansi_bytes(bytes, codepage));
1197
1215
  }
1198
1216
  if skip_depth > 0 {
1199
1217
  continue;
@@ -1202,14 +1220,13 @@ pub fn extract_text_from_rtf(
1202
1220
  if hidden_stack.last().copied().unwrap_or(false) {
1203
1221
  continue;
1204
1222
  }
1205
- if let (Some(h1), Some(h2)) = (hex1, hex2)
1206
- && let Some(byte) = parse_hex_byte(h1, h2)
1207
- {
1208
- let decoded = decode_windows_1252(byte);
1223
+ if let Some(bytes) = bytes.as_deref() {
1224
+ let codepage = ansi_codepage_stack.last().copied().unwrap_or(1252);
1225
+ let decoded = decode_ansi_bytes(bytes, codepage);
1209
1226
  if let Some(state) = table_state.as_mut()
1210
1227
  && state.in_row
1211
1228
  {
1212
- state.current_cell.push(decoded);
1229
+ state.current_cell.push_str(&decoded);
1213
1230
  } else {
1214
1231
  // Flush deferred boundary space
1215
1232
  if pending_boundary_space
@@ -1221,7 +1238,7 @@ pub fn extract_text_from_rtf(
1221
1238
  }
1222
1239
  pending_boundary_space = false;
1223
1240
  para_meta_emitted = false;
1224
- result.push(decoded);
1241
+ result.push_str(&decoded);
1225
1242
  if let Some(flag) = group_has_text.last_mut() {
1226
1243
  *flag = true;
1227
1244
  }
@@ -1331,6 +1348,13 @@ pub fn extract_text_from_rtf(
1331
1348
  {
1332
1349
  *uc = val.max(0) as u8;
1333
1350
  }
1351
+ if control_word == "ansicpg"
1352
+ && let Some(val) = _param
1353
+ && val > 0
1354
+ && let Some(codepage) = ansi_codepage_stack.last_mut()
1355
+ {
1356
+ *codepage = val as u32;
1357
+ }
1334
1358
  // Capture unicode chars inside footnote buffers
1335
1359
  if in_footnote
1336
1360
  && control_word == "u"
@@ -1382,6 +1406,7 @@ pub fn extract_text_from_rtf(
1382
1406
  &mut para_metas,
1383
1407
  &mut para_meta_emitted,
1384
1408
  &mut uc_stack,
1409
+ &mut ansi_codepage_stack,
1385
1410
  &mut footnote_count,
1386
1411
  in_footnote,
1387
1412
  &mut footnote_buf,
@@ -1528,6 +1553,23 @@ pub fn extract_text_from_rtf(
1528
1553
  (final_result, tables, images, para_metas, formatting_data)
1529
1554
  }
1530
1555
 
1556
+ fn consume_adjacent_hex_escape(chars: &mut std::iter::Peekable<std::str::Chars>) -> Option<u8> {
1557
+ let mut lookahead = chars.clone();
1558
+ if lookahead.next()? != '\\' || lookahead.next()? != '\'' {
1559
+ return None;
1560
+ }
1561
+ let h1 = lookahead.next()?;
1562
+ let h2 = lookahead.next()?;
1563
+ let byte = parse_hex_byte(h1, h2)?;
1564
+
1565
+ chars.next();
1566
+ chars.next();
1567
+ chars.next();
1568
+ chars.next();
1569
+
1570
+ Some(byte)
1571
+ }
1572
+
1531
1573
  /// Handle an RTF control word during parsing.
1532
1574
  #[allow(clippy::too_many_arguments, clippy::ptr_arg)]
1533
1575
  fn handle_control_word(
@@ -1549,6 +1591,7 @@ fn handle_control_word(
1549
1591
  para_metas: &mut Vec<ParagraphMeta>,
1550
1592
  para_meta_emitted: &mut bool,
1551
1593
  uc_stack: &mut Vec<u8>,
1594
+ ansi_codepage_stack: &mut [u32],
1552
1595
  footnote_count: &mut usize,
1553
1596
  _in_footnote: bool,
1554
1597
  _footnote_buf: &mut String,
@@ -1617,6 +1660,14 @@ fn handle_control_word(
1617
1660
  *uc = val.max(0) as u8;
1618
1661
  }
1619
1662
  }
1663
+ "ansicpg" => {
1664
+ if let Some(val) = param
1665
+ && val > 0
1666
+ && let Some(codepage) = ansi_codepage_stack.last_mut()
1667
+ {
1668
+ *codepage = val as u32;
1669
+ }
1670
+ }
1620
1671
  // Unicode escape: \u1234 (signed integer)
1621
1672
  "u" => {
1622
1673
  if let Some(code_num) = param {
@@ -92,23 +92,28 @@ pub async fn extract_structured(
92
92
  let sanitized_schema = sanitize_schema_for_provider(&config.schema, &config.llm.model);
93
93
 
94
94
  // Build chat request with JSON schema response format.
95
- // Use field assignment because `stream` is pub(crate) in liter-llm.
96
- let mut request = liter_llm::ChatCompletionRequest::default();
97
- request.model = config.llm.model.clone();
98
- request.messages = vec![liter_llm::Message::User(liter_llm::UserMessage {
99
- content: liter_llm::UserContent::Text(prompt),
100
- name: None,
101
- })];
102
- request.temperature = config.llm.temperature;
103
- request.max_tokens = config.llm.max_tokens;
104
- request.response_format = Some(liter_llm::ResponseFormat::JsonSchema {
105
- json_schema: liter_llm::JsonSchemaFormat {
106
- name: config.schema_name.clone(),
107
- description: config.schema_description.clone(),
108
- schema: sanitized_schema,
109
- strict: Some(config.strict),
110
- },
111
- });
95
+ // Use field assignment because `stream` is pub(crate) in liter-llm; struct-init
96
+ // syntax with `..Default::default()` won't compile across the crate boundary.
97
+ #[allow(clippy::field_reassign_with_default)]
98
+ let request = {
99
+ let mut req = liter_llm::ChatCompletionRequest::default();
100
+ req.model = config.llm.model.clone();
101
+ req.messages = vec![liter_llm::Message::User(liter_llm::UserMessage {
102
+ content: liter_llm::UserContent::Text(prompt),
103
+ name: None,
104
+ })];
105
+ req.temperature = config.llm.temperature;
106
+ req.max_tokens = config.llm.max_tokens;
107
+ req.response_format = Some(liter_llm::ResponseFormat::JsonSchema {
108
+ json_schema: liter_llm::JsonSchemaFormat {
109
+ name: config.schema_name.clone(),
110
+ description: config.schema_description.clone(),
111
+ schema: sanitized_schema,
112
+ strict: Some(config.strict),
113
+ },
114
+ });
115
+ req
116
+ };
112
117
 
113
118
  let response = client
114
119
  .chat(request)
@@ -136,12 +136,17 @@ pub async fn vlm_ocr(
136
136
  name: None,
137
137
  });
138
138
 
139
- // Use mutable default because `stream` is pub(crate) in liter-llm.
140
- let mut request = ChatCompletionRequest::default();
141
- request.model = config.model.clone();
142
- request.messages = vec![message];
143
- request.temperature = config.temperature;
144
- request.max_tokens = config.max_tokens;
139
+ // Use mutable default because `stream` is pub(crate) in liter-llm; struct-init
140
+ // syntax with `..Default::default()` won't compile across the crate boundary.
141
+ #[allow(clippy::field_reassign_with_default)]
142
+ let request = {
143
+ let mut req = ChatCompletionRequest::default();
144
+ req.model = config.model.clone();
145
+ req.messages = vec![message];
146
+ req.temperature = config.temperature;
147
+ req.max_tokens = config.max_tokens;
148
+ req
149
+ };
145
150
 
146
151
  let response = client.chat(request).await.map_err(|e| {
147
152
  crate::KreuzbergError::ocr(format!(
@@ -36,6 +36,7 @@ pub(crate) fn current_pdf_path() -> Option<PathBuf> {
36
36
  /// Returns segments per page (indexed by page number, 0-based).
37
37
  /// Returns `None` if pdf_oxide fails to open or extract the document.
38
38
  #[cfg(feature = "pdf")]
39
+ #[allow(unused_mut)] // pdf is mutated under feature-gated paths only
39
40
  pub(crate) fn extract_segments_with_oxide(page_count: usize) -> Option<Vec<Vec<SegmentData>>> {
40
41
  let file_path = match current_pdf_path() {
41
42
  Some(p) => {
@@ -98,6 +98,33 @@ async fn test_rtf_accent_extraction() {
98
98
  );
99
99
  }
100
100
 
101
+ /// Test extraction of RTF file with CP1251 hex byte escapes.
102
+ ///
103
+ /// File: ansicpg1251.rtf
104
+ /// Content: Cyrillic text encoded as `\'hh` bytes with `\ansicpg1251`
105
+ /// Expected: Decodes byte escapes with the declared ANSI codepage
106
+ #[tokio::test]
107
+ async fn test_rtf_ansicpg1251_extraction() {
108
+ let config = ExtractionConfig::default();
109
+ let path = get_rtf_path("ansicpg1251.rtf");
110
+
111
+ let result = extract_file(&path, Some("application/rtf"), &config).await;
112
+
113
+ assert!(result.is_ok(), "RTF extraction should succeed for ansicpg1251.rtf");
114
+ let extraction = result.expect("Operation failed");
115
+
116
+ assert_eq!(extraction.mime_type, "application/rtf");
117
+ assert!(
118
+ extraction.content.contains("Привет, мир!"),
119
+ "Should decode CP1251 hex escapes as Cyrillic text (found: {})",
120
+ extraction.content
121
+ );
122
+ assert!(
123
+ !extraction.content.contains("Ïðèâåò"),
124
+ "Should not decode CP1251 bytes as Windows-1252 mojibake"
125
+ );
126
+ }
127
+
101
128
  /// Test extraction of RTF file with bookmarks (internal anchors/references).
102
129
  ///
103
130
  /// File: bookmark.rtf
@@ -531,6 +558,7 @@ async fn test_rtf_no_critical_content_loss() {
531
558
 
532
559
  let must_extract = vec![
533
560
  "unicode.rtf",
561
+ "ansicpg1251.rtf",
534
562
  "accent.rtf",
535
563
  "heading.rtf",
536
564
  "list_simple.rtf",
@@ -574,7 +602,13 @@ async fn test_rtf_no_critical_content_loss() {
574
602
  async fn test_rtf_mime_type_preservation() {
575
603
  let config = ExtractionConfig::default();
576
604
 
577
- let test_files = vec!["unicode.rtf", "accent.rtf", "heading.rtf", "list_simple.rtf"];
605
+ let test_files = vec![
606
+ "unicode.rtf",
607
+ "ansicpg1251.rtf",
608
+ "accent.rtf",
609
+ "heading.rtf",
610
+ "list_simple.rtf",
611
+ ];
578
612
 
579
613
  for filename in test_files {
580
614
  let path = get_rtf_path(filename);
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,14 +28,14 @@ tree-sitter = ["kreuzberg/tree-sitter"]
28
28
  [dependencies]
29
29
  ahash = { version = "0.8.12", features = ["serde"] }
30
30
  async-trait = "0.1.89"
31
- ctor = "0.10"
32
- html-to-markdown-rs = { version = "3.3.1", default-features = false }
33
- kreuzberg = { path = "../kreuzberg", version = "4.9.7", default-features = false, features = ["bundled-pdfium", "full"] }
31
+ ctor = "1.0"
32
+ html-to-markdown-rs = { version = "3.4.1", default-features = false }
33
+ kreuzberg = { path = "../kreuzberg", version = "4.9.8", default-features = false, features = ["bundled-pdfium", "full"] }
34
34
  log = "0.4"
35
35
  rayon = { version = "1.12.0", optional = true }
36
36
  serde = { version = "1.0.228", features = ["derive"] }
37
37
  serde_json = { version = "1.0.149" }
38
- tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
38
+ tokio = { version = "1.52.3", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
39
39
 
40
40
  [build-dependencies]
41
41
  cbindgen = "0.29"
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 9
12
- #define KREUZBERG_VERSION_PATCH 7
13
- #define KREUZBERG_VERSION "4.9.7"
12
+ #define KREUZBERG_VERSION_PATCH 8
13
+ #define KREUZBERG_VERSION "4.9.8"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -108,7 +108,7 @@ pub use types::*;
108
108
  pub use util::{kreuzberg_last_error, kreuzberg_last_error_code, kreuzberg_last_panic_context, kreuzberg_version};
109
109
  pub use validation::*;
110
110
 
111
- #[ctor::ctor]
111
+ #[ctor::ctor(unsafe)]
112
112
  fn setup_onnx_runtime_path() {
113
113
  kreuzberg::ort_discovery::ensure_ort_available();
114
114
  }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.9.7"
3
+ version = "4.9.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -38,21 +38,21 @@ dynamic-linking = []
38
38
  thiserror = "2.0.18"
39
39
 
40
40
  [build-dependencies]
41
- cc = { version = "^1.2.60", optional = true }
41
+ cc = { version = "^1.2.62", optional = true }
42
42
  cmake = { version = "0.1.58", optional = true }
43
43
  zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
44
44
  "deflate-flate2-zlib-rs",
45
45
  ] }
46
46
 
47
47
  [target.'cfg(not(target_os = "windows"))'.build-dependencies]
48
- reqwest = { version = "0.13.2", default-features = false, features = [
48
+ reqwest = { version = "0.13.3", default-features = false, features = [
49
49
  "blocking",
50
50
  "rustls",
51
51
  ], optional = true }
52
52
 
53
53
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
54
54
  [target.'cfg(target_os = "windows")'.build-dependencies]
55
- reqwest = { version = "0.13.2", default-features = false, features = [
55
+ reqwest = { version = "0.13.3", default-features = false, features = [
56
56
  "blocking",
57
57
  "native-tls",
58
58
  ], optional = true }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.9.7
4
+ version: 4.9.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-08 00:00:00.000000000 Z
11
+ date: 2026-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys