kreuzberg 4.9.0 → 4.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb7b77bae36a5da34ce209fbf1ea7c0a68aef4b22f8b373b908f9c113f404ef5
4
- data.tar.gz: a6c8667aee6ae2c9e11d45fc98fcb355561fec6e4a7d51d852664bd6367af8cc
3
+ metadata.gz: 559b3104e6e21f2f14a92949d427703b51ea8b35b7a643d8964b6953785aa6e1
4
+ data.tar.gz: bfce92579c45ecba0da0d8e1f077ecca0bb9dd6a1e96c950e8beb8d0a39b5884
5
5
  SHA512:
6
- metadata.gz: 7569a4914ab4a4d440a0c74e622a9f26f7189b62bc9c2d05fc5e857a32c8fabde8eb854edef34e94bd95d5357e44137c1573e7ce68db45ed85c26dbe31e6972b
7
- data.tar.gz: 9741106549d7bf79cc1ae34a07f686cca1bf6a4c19fcb01b40cd8f1372166c8e9c3a0321e1e26e416ebb98d413ee1c9093d247949292a7c07a85594ea1df508e
6
+ metadata.gz: 1ea8af57d65eb5008126758041df2bfe07acca9d47ebfbf9c9de79f12b5d5ff2336d55b643268d1ea420db1825eaf9bfef6e5deb7335bd2449e9ccb62800492d
7
+ data.tar.gz: '019f2abaa7dcaf2b91925f7ed0b5332ce569a41a5ae1f152d698423cce1276ea68e177c2088b4377c369943e235c70584f61a76da5600d6dd8b3bd075bc266ab'
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -2916,7 +2916,7 @@ dependencies = [
2916
2916
 
2917
2917
  [[package]]
2918
2918
  name = "kreuzberg-rb"
2919
- version = "4.8.6"
2919
+ version = "4.9.1"
2920
2920
  dependencies = [
2921
2921
  "async-trait",
2922
2922
  "html-to-markdown-rs",
@@ -5734,9 +5734,9 @@ checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
5734
5734
 
5735
5735
  [[package]]
5736
5736
  name = "typenum"
5737
- version = "1.19.0"
5737
+ version = "1.20.0"
5738
5738
  source = "registry+https://github.com/rust-lang/crates.io-index"
5739
- checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
5739
+ checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
5740
5740
 
5741
5741
  [[package]]
5742
5742
  name = "unicase"
@@ -6186,9 +6186,9 @@ dependencies = [
6186
6186
 
6187
6187
  [[package]]
6188
6188
  name = "web_atoms"
6189
- version = "0.2.3"
6189
+ version = "0.2.4"
6190
6190
  source = "registry+https://github.com/rust-lang/crates.io-index"
6191
- checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576"
6191
+ checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538"
6192
6192
  dependencies = [
6193
6193
  "phf",
6194
6194
  "phf_codegen",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.0'
4
+ VERSION = '4.9.1'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.0"
5
+ version = "4.9.1"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.2.5", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.0", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.0" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.1", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.1" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.33", default-features = false }
48
+ pdf_oxide = { version = "0.3.34", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.12.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.33", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.0 Release**
21
+ > **🚀 Version 4.9.1 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -284,7 +284,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
284
284
 
285
285
  let document = doc_builder
286
286
  .map(|b| b.build())
287
- .and_then(|d| if d.is_empty() { None } else { Some(d) });
287
+ .filter(|d| !d.is_empty());
288
288
 
289
289
  Ok(PptxExtractionResult {
290
290
  content,
@@ -90,28 +90,26 @@ fn find_sentence_end(text: &str, from: usize) -> Option<usize> {
90
90
  }
91
91
 
92
92
  // Look for sentence terminals: . ! ?
93
- match memchr3(b'.', b'!', b'?', &bytes[pos..]) {
94
- None => return None,
95
- Some(offset) => {
96
- let terminal_pos = pos + offset;
97
- // Consume consecutive terminals (e.g., "..." or "?!")
98
- let mut end = terminal_pos + 1;
99
- while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
100
- end += 1;
101
- }
102
-
103
- // Consume closing quotes/brackets after terminal
104
- while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
105
- end += 1;
106
- }
93
+ {
94
+ let offset = memchr3(b'.', b'!', b'?', &bytes[pos..])?;
95
+ let terminal_pos = pos + offset;
96
+ // Consume consecutive terminals (e.g., "..." or "?!")
97
+ let mut end = terminal_pos + 1;
98
+ while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
99
+ end += 1;
100
+ }
107
101
 
108
- // Check if this is a real sentence boundary
109
- if is_sentence_boundary(text, terminal_pos, end) {
110
- return Some(end);
111
- }
102
+ // Consume closing quotes/brackets after terminal
103
+ while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
104
+ end += 1;
105
+ }
112
106
 
113
- pos = end;
107
+ // Check if this is a real sentence boundary
108
+ if is_sentence_boundary(text, terminal_pos, end) {
109
+ return Some(end);
114
110
  }
111
+
112
+ pos = end;
115
113
  }
116
114
  }
117
115
 
@@ -320,12 +320,12 @@ pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Resu
320
320
  let authors = metadata_cache[2]
321
321
  .as_ref()
322
322
  .map(|author_str| parse_authors(author_str))
323
- .and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
323
+ .filter(|parsed| !parsed.is_empty());
324
324
 
325
325
  let keywords = metadata_cache[3]
326
326
  .as_ref()
327
327
  .map(|keywords_str| parse_keywords(keywords_str))
328
- .and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
328
+ .filter(|parsed| !parsed.is_empty());
329
329
 
330
330
  let created_at = metadata_cache[4].as_ref().map(|date_str| parse_pdf_date(date_str));
331
331
 
@@ -94,11 +94,11 @@ fn extract_common_metadata(doc: &mut OxideDocument) -> Result<CommonPdfMetadata>
94
94
 
95
95
  let authors = get_info_string(&mut doc.doc, "Author")
96
96
  .map(|author_str| parse_authors(&author_str))
97
- .and_then(|parsed| if parsed.is_empty() { None } else { Some(parsed) });
97
+ .filter(|parsed| !parsed.is_empty());
98
98
 
99
99
  let keywords = get_info_string(&mut doc.doc, "Keywords")
100
100
  .map(|kw_str| parse_keywords(&kw_str))
101
- .and_then(|parsed| if parsed.is_empty() { None } else { Some(parsed) });
101
+ .filter(|parsed| !parsed.is_empty());
102
102
 
103
103
  let created_at = get_info_string(&mut doc.doc, "CreationDate").map(|d| parse_pdf_date(&d));
104
104
  let modified_at = get_info_string(&mut doc.doc, "ModDate").map(|d| parse_pdf_date(&d));
@@ -75,9 +75,7 @@ pub(crate) fn extract_tables_native(doc: &mut OxideDocument) -> Result<Vec<Table
75
75
  ///
76
76
  /// Maps rows/cells from the native table structure to a 2D `Vec<Vec<String>>`
77
77
  /// grid and builds a markdown representation with proper header separators.
78
- fn convert_extracted_table(
79
- table: &pdf_oxide::structure::table_extractor::ExtractedTable,
80
- ) -> (Vec<Vec<String>>, String) {
78
+ fn convert_extracted_table(table: &pdf_oxide::structure::table_extractor::Table) -> (Vec<Vec<String>>, String) {
81
79
  let mut cells: Vec<Vec<String>> = Vec::with_capacity(table.rows.len());
82
80
  let mut markdown = String::new();
83
81
  let mut found_header = false;
@@ -124,7 +122,7 @@ mod tests {
124
122
 
125
123
  #[test]
126
124
  fn test_convert_extracted_table_basic() {
127
- use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
125
+ use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
128
126
 
129
127
  let table = ExtractedTable {
130
128
  rows: vec![
@@ -191,7 +189,7 @@ mod tests {
191
189
 
192
190
  #[test]
193
191
  fn test_convert_extracted_table_no_header() {
194
- use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
192
+ use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
195
193
 
196
194
  let table = ExtractedTable {
197
195
  rows: vec![
@@ -233,7 +231,7 @@ mod tests {
233
231
 
234
232
  #[test]
235
233
  fn test_convert_extracted_table_empty() {
236
- use pdf_oxide::structure::table_extractor::ExtractedTable;
234
+ use pdf_oxide::structure::table_extractor::Table as ExtractedTable;
237
235
 
238
236
  let table = ExtractedTable {
239
237
  rows: vec![],
@@ -1465,9 +1465,11 @@ pub(crate) fn extract_document_structure_from_segments(
1465
1465
  .collect();
1466
1466
 
1467
1467
  // Stage 3: Per-page structured extraction.
1468
- // When the structure tree provides heading roles, skip layout-model heading
1469
- // overrides — they can demote correctly-tagged headings. The tree is authoritative.
1470
- let effective_layout_hints = if used_structure_tree { None } else { layout_hints };
1468
+ // Always pass layout hints regardless of structure tree status. Layout hints
1469
+ // provide multi-purpose classification (furniture/header/footer marking, table
1470
+ // regions, list items) beyond just heading overrides. The structure tree's
1471
+ // heading roles are still respected via assigned_role on segments.
1472
+ let effective_layout_hints = layout_hints;
1471
1473
  let page_inputs: Vec<PageInput> = (0..page_count)
1472
1474
  .map(|i| PageInput {
1473
1475
  page_index: i,
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -28,7 +28,7 @@ ahash = { version = "0.8.12", features = ["serde"] }
28
28
  async-trait = "0.1.89"
29
29
  ctor = "0.10"
30
30
  html-to-markdown-rs = { version = "3.2.5", default-features = false }
31
- kreuzberg = { path = "../kreuzberg", version = "4.9.0", default-features = false, features = ["bundled-pdfium", "full"] }
31
+ kreuzberg = { path = "../kreuzberg", version = "4.9.1", default-features = false, features = ["bundled-pdfium", "full"] }
32
32
  log = "0.4"
33
33
  rayon = { version = "1.12.0", optional = true }
34
34
  serde = { version = "1.0.228", features = ["derive"] }
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 9
12
- #define KREUZBERG_VERSION_PATCH 0
13
- #define KREUZBERG_VERSION "4.9.0"
12
+ #define KREUZBERG_VERSION_PATCH 1
13
+ #define KREUZBERG_VERSION "4.9.1"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.9.0"
3
+ version = "4.9.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -613,8 +613,11 @@ mod build_tesseract {
613
613
  let eng_traineddata = bundled_tessdata_dir.join("eng.traineddata");
614
614
  if !eng_traineddata.exists() {
615
615
  fs::create_dir_all(&bundled_tessdata_dir).expect("Failed to create tessdata directory");
616
- download_file(
617
- "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata",
616
+ download_file_with_fallback(
617
+ &[
618
+ "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata",
619
+ "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main/eng.traineddata",
620
+ ],
618
621
  &eng_traineddata,
619
622
  "eng.traineddata",
620
623
  );
@@ -911,49 +914,66 @@ mod build_tesseract {
911
914
  }
912
915
 
913
916
  /// Download a single file to a destination path with retries.
914
- fn download_file(url: &str, dest: &Path, label: &str) {
917
+ /// Download a single file, trying each URL in order. Each URL gets up to
918
+ /// `max_attempts` retries with exponential backoff before falling through
919
+ /// to the next URL.
920
+ fn download_file_with_fallback(urls: &[&str], dest: &Path, label: &str) {
915
921
  let client = reqwest::blocking::Client::builder()
916
- .timeout(std::time::Duration::from_secs(120))
922
+ .timeout(std::time::Duration::from_secs(300))
917
923
  .http1_only()
918
924
  .build()
919
925
  .expect("Failed to create HTTP client");
920
926
 
921
- eprintln!("Downloading {} from {}", label, url);
922
- let max_attempts = 3;
923
-
924
- for attempt in 1..=max_attempts {
925
- let err_msg = match client.get(url).send() {
926
- Ok(resp) => {
927
- if resp.status().is_success() {
928
- match resp.bytes() {
929
- Ok(bytes) => {
930
- fs::write(dest, &bytes).expect("Failed to write downloaded file");
931
- eprintln!("Downloaded {} ({} bytes)", label, bytes.len());
932
- return;
927
+ let max_attempts: u32 = 5;
928
+ let mut last_err = String::new();
929
+
930
+ for url in urls {
931
+ eprintln!("Downloading {} from {}", label, url);
932
+
933
+ for attempt in 1..=max_attempts {
934
+ let err_msg = match client.get(*url).send() {
935
+ Ok(resp) => {
936
+ if resp.status().is_success() {
937
+ match resp.bytes() {
938
+ Ok(bytes) => {
939
+ fs::write(dest, &bytes).expect("Failed to write downloaded file");
940
+ eprintln!("Downloaded {} ({} bytes)", label, bytes.len());
941
+ return;
942
+ }
943
+ Err(err) => format!("Failed to read response: {}", err),
933
944
  }
934
- Err(err) => format!("Failed to read response: {}", err),
945
+ } else {
946
+ format!("HTTP {}", resp.status().as_u16())
935
947
  }
936
- } else {
937
- format!("HTTP {}", resp.status().as_u16())
938
948
  }
949
+ Err(err) => err.to_string(),
950
+ };
951
+
952
+ last_err = err_msg.clone();
953
+
954
+ if attempt == max_attempts {
955
+ println!(
956
+ "cargo:warning=All {} attempts for {} exhausted on URL {}",
957
+ max_attempts, label, url
958
+ );
959
+ break;
939
960
  }
940
- Err(err) => err.to_string(),
941
- };
942
961
 
943
- if attempt == max_attempts {
944
- panic!(
945
- "Failed to download {} after {} attempts: {}",
946
- label, max_attempts, err_msg
962
+ let backoff = 2u64.pow((attempt - 1).min(4));
963
+ println!(
964
+ "cargo:warning=Download attempt {}/{} for {} failed ({}). Retrying in {}s...",
965
+ attempt, max_attempts, label, err_msg, backoff
947
966
  );
967
+ std::thread::sleep(std::time::Duration::from_secs(backoff));
948
968
  }
949
-
950
- let backoff = 2u64.pow((attempt - 1).min(3));
951
- println!(
952
- "cargo:warning=Download attempt {}/{} for {} failed ({}). Retrying in {}s...",
953
- attempt, max_attempts, label, err_msg, backoff
954
- );
955
- std::thread::sleep(std::time::Duration::from_secs(backoff));
956
969
  }
970
+
971
+ panic!(
972
+ "Failed to download {} after trying {} URL(s): {}",
973
+ label,
974
+ urls.len(),
975
+ last_err
976
+ );
957
977
  }
958
978
 
959
979
  fn normalize_cmake_path(path: &Path) -> String {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.9.0
4
+ version: 4.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-18 00:00:00.000000000 Z
11
+ date: 2026-04-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys