kreuzberg 4.7.2 → 4.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c713d0a652ed4e1752ce0c3f6e044516e870349de149b2fdd5cb6c429c722d65
4
- data.tar.gz: b519e5b70800b61bd6e5d6acef3a0fee0de8d53f0ad625e863753573f79caf4f
3
+ metadata.gz: 0f26fc086a0221056b94cd10141832240ef9563783835ef66555445e0d33442d
4
+ data.tar.gz: ed8488fcdd8bd12266ec2be8984ec5f5b60a32aaa562e64b442d9a4a641c8841
5
5
  SHA512:
6
- metadata.gz: 0cec6f964c7975e905997422f296a129fe32974c221300f1d6f89d5f54b27ca48e7343522e961d044cc90d9c7f0f4ef20c42de0c95b20e6794614cd1243bd03b
7
- data.tar.gz: 8b173be0cb820ade74c4e572465a28257911e9cd1d4ed510f93d26e65529c47f1e637f59a215171b8f931f4a45037bb9b9e5069fd4542b4f911fa36908c65364
6
+ metadata.gz: 485460f24ddbf58b82a41873a61c5d78408012b415e62a52daeed43df2d150b317115f3ca79bc029a9394188da70d6733f79ba8e61f589c58bee7d1a845b17d8
7
+ data.tar.gz: 5e80e6e34fe8125a934b141ad3ab058240b81f4b82d609e0e2aea822131cbd4954dd7112e152a90542fc56d494519182c62cc14b387f658944ebbc73bc5037a1
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1565,9 +1565,9 @@ dependencies = [
1565
1565
 
1566
1566
  [[package]]
1567
1567
  name = "fastrand"
1568
- version = "2.3.0"
1568
+ version = "2.4.0"
1569
1569
  source = "registry+https://github.com/rust-lang/crates.io-index"
1570
- checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
1570
+ checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
1571
1571
 
1572
1572
  [[package]]
1573
1573
  name = "fax"
@@ -2938,7 +2938,7 @@ dependencies = [
2938
2938
 
2939
2939
  [[package]]
2940
2940
  name = "kreuzberg-rb"
2941
- version = "4.7.1"
2941
+ version = "4.7.2"
2942
2942
  dependencies = [
2943
2943
  "async-trait",
2944
2944
  "html-to-markdown-rs",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.7.2'
4
+ VERSION = '4.7.3'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.7.2"
5
+ version = "4.7.3"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.7.2", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.2" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.7.3", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.3" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  log = "0.4"
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
43
43
  once_cell = "1.21.4"
44
44
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
45
45
  parking_lot = "0.12.5"
46
- pdf_oxide = { version = "0.3.19", default-features = false }
46
+ pdf_oxide = { version = "0.3.20", default-features = false }
47
47
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
48
48
  rayon = "1.11.0"
49
49
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -307,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
307
307
  outlook-pst = { version = "1.2.0", optional = true }
308
308
  parking_lot = "0.12.5"
309
309
  pastey = "0.2"
310
- pdf_oxide = { version = "0.3.19", default-features = false, optional = true }
310
+ pdf_oxide = { version = "0.3.20", default-features = false, optional = true }
311
311
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
312
312
  pulldown-cmark = { version = "0.13" }
313
313
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -404,4 +404,4 @@ tempfile = "3.27.0"
404
404
  tokio = { version = "1.51.0", features = ["macros", "time"] }
405
405
  tokio-test = "0.4"
406
406
  tracing-subscriber = { version = "0.3", features = ["env-filter"] }
407
- zip = { version = ">=7.0.0, <7.4.0", default-features = false, features = ["deflate-flate2"] }
407
+ zip = { version = ">=7.0.0, <8.6.0", default-features = false, features = ["deflate-flate2"] }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.7.2 Release**
21
+ > **🚀 Version 4.7.3 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -593,11 +593,21 @@ impl PdfExtractor {
593
593
  );
594
594
  (native_text, false)
595
595
  } else if decision.fallback || has_font_encoding_issues {
596
- let (ocr_text, ocr_tbls, ocr_elems, ocr_doc) = run_ocr_with_layout(content, config, path).await?;
597
- ocr_tables = ocr_tbls;
598
- _ocr_elements_from_ocr = ocr_elems;
599
- ocr_internal_doc = ocr_doc;
600
- (ocr_text, true)
596
+ match run_ocr_with_layout(content, config, path).await {
597
+ Ok((ocr_text, ocr_tbls, ocr_elems, ocr_doc)) => {
598
+ ocr_tables = ocr_tbls;
599
+ _ocr_elements_from_ocr = ocr_elems;
600
+ ocr_internal_doc = ocr_doc;
601
+ (ocr_text, true)
602
+ }
603
+ Err(e) => {
604
+ tracing::warn!(
605
+ error = %e,
606
+ "OCR fallback failed; using native text extraction result"
607
+ );
608
+ (native_text, false)
609
+ }
610
+ }
601
611
  } else {
602
612
  (native_text, false)
603
613
  }
@@ -197,13 +197,20 @@ impl OcrBackend for TesseractBackend {
197
197
  let processor = Arc::clone(&self.processor);
198
198
  let image_bytes = image_bytes.to_vec();
199
199
 
200
- let ocr_result = tokio::task::spawn_blocking(move || match output_format {
201
- Some(fmt) => processor.process_image_with_format(&image_bytes, &tess_config_clone, fmt),
202
- None => processor.process_image(&image_bytes, &tess_config_clone),
200
+ let ocr_result = tokio::task::spawn_blocking(move || {
201
+ std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match output_format {
202
+ Some(fmt) => processor.process_image_with_format(&image_bytes, &tess_config_clone, fmt),
203
+ None => processor.process_image(&image_bytes, &tess_config_clone),
204
+ }))
205
+ .unwrap_or_else(|_| {
206
+ Err(crate::ocr::error::OcrError::ProcessingFailed(
207
+ "Tesseract/Leptonica foreign exception caught".to_string(),
208
+ ))
209
+ })
203
210
  })
204
211
  .await
205
212
  .map_err(|e| crate::KreuzbergError::Plugin {
206
- message: format!("Tesseract task panicked: {}", e),
213
+ message: format!("Tesseract task panicked or caught foreign exception: {}", e),
207
214
  plugin_name: "tesseract".to_string(),
208
215
  })?
209
216
  .map_err(|e| crate::KreuzbergError::Ocr {
@@ -302,13 +309,20 @@ impl OcrBackend for TesseractBackend {
302
309
  let processor = Arc::clone(&self.processor);
303
310
  let path_str = path.to_string_lossy().to_string();
304
311
 
305
- let ocr_result = tokio::task::spawn_blocking(move || match output_format {
306
- Some(fmt) => processor.process_image_file_with_format(&path_str, &tess_config_clone, fmt),
307
- None => processor.process_image_file(&path_str, &tess_config_clone),
312
+ let ocr_result = tokio::task::spawn_blocking(move || {
313
+ std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match output_format {
314
+ Some(fmt) => processor.process_image_file_with_format(&path_str, &tess_config_clone, fmt),
315
+ None => processor.process_image_file(&path_str, &tess_config_clone),
316
+ }))
317
+ .unwrap_or_else(|_| {
318
+ Err(crate::ocr::error::OcrError::ProcessingFailed(
319
+ "Tesseract/Leptonica foreign exception caught".to_string(),
320
+ ))
321
+ })
308
322
  })
309
323
  .await
310
324
  .map_err(|e| crate::KreuzbergError::Plugin {
311
- message: format!("Tesseract task panicked: {}", e),
325
+ message: format!("Tesseract task panicked or caught foreign exception: {}", e),
312
326
  plugin_name: "tesseract".to_string(),
313
327
  })?
314
328
  .map_err(|e| crate::KreuzbergError::Ocr {
@@ -83,6 +83,7 @@ fn map_content_role(role: &ContentRole) -> (SemanticRole, Option<String>) {
83
83
  /// The resulting paragraphs feed into `apply_layout_overrides` and
84
84
  /// `assemble_internal_document`, matching the pdfium native text pipeline.
85
85
  #[cfg(feature = "ocr")]
86
+ #[allow(dead_code)] // Called from extractors/pdf/ocr.rs only when layout-detection is also enabled
86
87
  pub(crate) fn ocr_doc_to_paragraphs(
87
88
  doc: &crate::types::internal::InternalDocument,
88
89
  page_height_px: u32,
@@ -211,6 +211,15 @@ fn finalize_paragraph(
211
211
 
212
212
  // Join line texts with newlines (preserving full_text content exactly).
213
213
  let text: String = lines.iter().map(|l| l.text.as_str()).collect::<Vec<_>>().join("\n");
214
+
215
+ // Convert embedded HTML to markdown if detected (e.g., PDFs with HTML in text layer).
216
+ #[cfg(feature = "html")]
217
+ let text = if crate::pdf::text::contains_html_markup(&text) {
218
+ crate::pdf::text::convert_html_page_text(&text)
219
+ } else {
220
+ text
221
+ };
222
+
214
223
  let trimmed = text.trim();
215
224
  if trimmed.is_empty() {
216
225
  return None;
@@ -546,6 +555,13 @@ struct CharFontInfo {
546
555
  fn extract_page_blocks(page: &PdfPage) -> Option<(Vec<SegmentData>, String, Vec<f32>)> {
547
556
  let text_api = page.text().ok()?;
548
557
  let full_text = text_api.all();
558
+ // Convert embedded HTML to markdown if detected (PDFs with HTML in text layer).
559
+ #[cfg(feature = "html")]
560
+ let full_text = if crate::pdf::text::contains_html_markup(&full_text) {
561
+ crate::pdf::text::convert_html_page_text(&full_text)
562
+ } else {
563
+ full_text
564
+ };
549
565
  if full_text.trim().is_empty() {
550
566
  return None;
551
567
  }
@@ -53,7 +53,7 @@ fn fix_pdf_control_chars(text: &str) -> Cow<'_, str> {
53
53
  /// Some PDFs embed raw HTML in their text layer (e.g. from web-to-PDF converters).
54
54
  /// This function detects common HTML tags to determine if the text should be
55
55
  /// converted from HTML to markdown rather than used as-is.
56
- fn contains_html_markup(text: &str) -> bool {
56
+ pub(crate) fn contains_html_markup(text: &str) -> bool {
57
57
  if !text.contains('<') {
58
58
  return false;
59
59
  }
@@ -72,7 +72,7 @@ fn contains_html_markup(text: &str) -> bool {
72
72
  /// Falls back to the original text if the `html` feature is not enabled
73
73
  /// or if conversion fails.
74
74
  #[cfg(feature = "html")]
75
- fn convert_html_page_text(text: &str) -> String {
75
+ pub(crate) fn convert_html_page_text(text: &str) -> String {
76
76
  match crate::extraction::html::convert_html_to_markdown(text, None, None) {
77
77
  Ok(converted) => converted,
78
78
  Err(_) => text.to_owned(),
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
41
41
  tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
42
42
 
43
43
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
44
- kreuzberg = { path = "../kreuzberg", version = "4.7.2", default-features = false, features = [
44
+ kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = [
45
45
  "pdf",
46
46
  "excel",
47
47
  "office",
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.7.2", default-features = false
64
64
  ] }
65
65
 
66
66
  [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
67
- kreuzberg = { path = "../kreuzberg", version = "4.7.2", default-features = false, features = ["bundled-pdfium", "full"] }
67
+ kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = ["bundled-pdfium", "full"] }
68
68
 
69
69
  [build-dependencies]
70
70
  cbindgen = "0.29"
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 7
12
- #define KREUZBERG_VERSION_PATCH 2
13
- #define KREUZBERG_VERSION "4.7.2"
12
+ #define KREUZBERG_VERSION_PATCH 3
13
+ #define KREUZBERG_VERSION "4.7.3"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.7.2"
3
+ version = "4.7.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -2160,7 +2160,7 @@ impl Clone for TesseractAPI {
2160
2160
  }
2161
2161
 
2162
2162
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
2163
- unsafe extern "C" {
2163
+ unsafe extern "C-unwind" {
2164
2164
  fn TessBaseAPIMeanTextConf(handle: *mut c_void) -> c_int;
2165
2165
  fn TessBaseAPISetVariable(handle: *mut c_void, name: *const c_char, value: *const c_char) -> c_int;
2166
2166
  fn TessBaseAPIGetStringVariable(handle: *mut c_void, name: *const c_char) -> *const c_char;
@@ -69,7 +69,7 @@ impl Drop for ChoiceIterator {
69
69
  }
70
70
  }
71
71
 
72
- unsafe extern "C" {
72
+ unsafe extern "C-unwind" {
73
73
  fn TessChoiceIteratorDelete(handle: *mut c_void);
74
74
  fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
75
75
  fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
@@ -29,7 +29,7 @@ use std::ffi::c_void;
29
29
  // ---------------------------------------------------------------------------
30
30
 
31
31
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
32
- unsafe extern "C" {
32
+ unsafe extern "C-unwind" {
33
33
  /// Allocates a new Pix with the given dimensions and bit depth.
34
34
  fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
35
35
 
@@ -60,7 +60,7 @@ impl Drop for TessMonitor {
60
60
  }
61
61
  }
62
62
 
63
- unsafe extern "C" {
63
+ unsafe extern "C-unwind" {
64
64
  pub fn TessMonitorCreate() -> *mut c_void;
65
65
  pub fn TessMonitorDelete(monitor: *mut c_void);
66
66
  pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
@@ -191,7 +191,7 @@ impl Drop for MutableIterator {
191
191
  }
192
192
  }
193
193
 
194
- unsafe extern "C" {
194
+ unsafe extern "C-unwind" {
195
195
  pub fn TessResultIteratorDelete(handle: *mut c_void);
196
196
  pub fn TessDeleteText(text: *mut c_char);
197
197
  }
@@ -380,7 +380,7 @@ impl Drop for PageIterator {
380
380
  }
381
381
  }
382
382
 
383
- unsafe extern "C" {
383
+ unsafe extern "C-unwind" {
384
384
  pub fn TessPageIteratorDelete(handle: *mut c_void);
385
385
  pub fn TessPageIteratorBegin(handle: *mut c_void);
386
386
  pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
@@ -555,7 +555,7 @@ impl Drop for ResultIterator {
555
555
  }
556
556
 
557
557
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
558
- unsafe extern "C" {
558
+ unsafe extern "C-unwind" {
559
559
  pub fn TessResultIteratorDelete(handle: *mut c_void);
560
560
  pub fn TessPageIteratorBegin(handle: *mut c_void);
561
561
  pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
@@ -198,7 +198,7 @@ impl Drop for TessResultRenderer {
198
198
  }
199
199
  }
200
200
 
201
- unsafe extern "C" {
201
+ unsafe extern "C-unwind" {
202
202
  pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
203
203
  pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
204
204
  pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.7.2
4
+ version: 4.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-04 00:00:00.000000000 Z
11
+ date: 2026-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys