kreuzberg 4.8.1 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 051fc6ca84b8545cb7049bd336a4897888b98f125fe40bfa24c6ab2e15232114
4
- data.tar.gz: f64b385b1612258e73a6f771d107375ca1f892af7af199cbbaa8b19f184058df
3
+ metadata.gz: 4ec647109dee8229fd50ce9fe4d87f13bbb769779b22b4862cac321052610d6c
4
+ data.tar.gz: 18c3cf6df8e339f4286da7fe9b8da84185fa93abf89fcbf339b259c0095a9a5c
5
5
  SHA512:
6
- metadata.gz: 949b5cdda46928b7981d4178efae64d64568155431785520774343fd27b47d2ebc7432d859fd404ff7f405cc93be7dbfda704e618b2aa079ff7d9522f7dac4c4
7
- data.tar.gz: c6ea82f649544a0db19accb6b4eca03af5e9619321a973fe5cac9ad1524014bb766dc7edcec302bb2bef64c7b7ab0deb0ba1760fe80ccf2b5547c1bc11d4ad29
6
+ metadata.gz: f8c0ab16048bdb9026b55ff15f3ae342af9178a2ab6b1fd85777c3271da35e46474bee025af429745d20584604d520d85a4c8e4ea96bffc49ef5dfca55471b6f
7
+ data.tar.gz: 46ec0ff10138bd48d7b9ffada23956cd557616d8fdf09090f16562d4daab04381176b2881e08bfdfc1f726a0061e1e25feefa695620fef530444662abb866605
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.1" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1005,6 +1005,16 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
1005
1005
  config.html_options = Some(parse_html_options(ruby, html_hash)?);
1006
1006
  }
1007
1007
 
1008
+ if let Some(val) = get_kw(ruby, hash, "html_output")
1009
+ && val.equal(ruby.qnil()).ok() != Some(true)
1010
+ {
1011
+ let html_output_json = ruby_value_to_json(val)?;
1012
+ let parsed: kreuzberg::core::config::html_output::HtmlOutputConfig =
1013
+ serde_json::from_value(html_output_json)
1014
+ .map_err(|e| runtime_error(format!("Invalid html_output: {}", e)))?;
1015
+ config.html_output = Some(parsed);
1016
+ }
1017
+
1008
1018
  if let Some(val) = get_kw(ruby, hash, "pages")
1009
1019
  && val.equal(ruby.qnil()).ok() != Some(true)
1010
1020
  {
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.1'
4
+ VERSION = '4.8.2'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.8.1"
5
+ version = "4.8.2"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.8.1", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.1" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.8.2", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.2" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.8.1 Release**
21
+ > **🚀 Version 4.8.2 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -1240,7 +1240,10 @@ pub fn extract_document_structure(
1240
1240
  // Deduplicate paragraphs with identical text within each page.
1241
1241
  // Catches bold/shadow rendering artifacts (consecutive duplicates)
1242
1242
  // and table content rendered as both table and body text.
1243
- deduplicate_paragraphs(&mut all_page_paragraphs);
1243
+ // When strip_repeating_text is disabled, skip dedup to preserve all content.
1244
+ if strip_repeating_text {
1245
+ deduplicate_paragraphs(&mut all_page_paragraphs);
1246
+ }
1244
1247
 
1245
1248
  let total_paragraphs: usize = all_page_paragraphs.iter().map(|p| p.len()).sum();
1246
1249
  tracing::debug!(
@@ -2291,4 +2294,41 @@ mod tests {
2291
2294
  assert!(paras[0].is_page_furniture);
2292
2295
  assert!(paras[1].is_page_furniture);
2293
2296
  }
2297
+
2298
+ #[test]
2299
+ fn test_deduplicate_paragraphs_removes_consecutive_duplicates() {
2300
+ let p1 = para(vec![line(vec![full_line_seg("Brand loses market share")])]);
2301
+ let p2 = para(vec![line(vec![full_line_seg("Brand loses market share")])]);
2302
+ let p3 = para(vec![line(vec![full_line_seg("Different content here")])]);
2303
+ let mut pages = vec![vec![p1, p2, p3]];
2304
+ deduplicate_paragraphs(&mut pages);
2305
+ assert_eq!(pages[0].len(), 2, "consecutive duplicate should be removed");
2306
+ }
2307
+
2308
+ #[test]
2309
+ fn test_deduplicate_paragraphs_removes_non_consecutive_body_duplicates() {
2310
+ let p1 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
2311
+ let p2 = para(vec![line(vec![full_line_seg("Some intervening paragraph")])]);
2312
+ let p3 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
2313
+ let mut pages = vec![vec![p1, p2, p3]];
2314
+ deduplicate_paragraphs(&mut pages);
2315
+ assert_eq!(pages[0].len(), 2, "non-consecutive body duplicate should be removed");
2316
+ }
2317
+
2318
+ #[test]
2319
+ fn test_deduplicate_paragraphs_preserves_non_consecutive_headings() {
2320
+ // Pass 2 (non-consecutive dedup) skips headings via is_dedup_candidate.
2321
+ let mut h = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
2322
+ h.heading_level = Some(2);
2323
+ let filler = para(vec![line(vec![full_line_seg("Some other content between them")])]);
2324
+ let mut h2 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
2325
+ h2.heading_level = Some(2);
2326
+ let mut pages = vec![vec![h, filler, h2]];
2327
+ deduplicate_paragraphs(&mut pages);
2328
+ assert_eq!(
2329
+ pages[0].len(),
2330
+ 3,
2331
+ "non-consecutive heading duplicates must be preserved"
2332
+ );
2333
+ }
2294
2334
  }
@@ -252,6 +252,7 @@ fn test_extraction_config_no_unknown_fields_in_default() {
252
252
  "keywords",
253
253
  "postprocessor",
254
254
  "html_options",
255
+ "html_output",
255
256
  "max_concurrent_extractions",
256
257
  "result_format",
257
258
  "output_format",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
41
41
  tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
42
42
 
43
43
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
44
- kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false, features = [
44
+ kreuzberg = { path = "../kreuzberg", version = "4.8.2", default-features = false, features = [
45
45
  "pdf",
46
46
  "excel",
47
47
  "office",
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false
64
64
  ] }
65
65
 
66
66
  [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
67
- kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false, features = ["bundled-pdfium", "full"] }
67
+ kreuzberg = { path = "../kreuzberg", version = "4.8.2", default-features = false, features = ["bundled-pdfium", "full"] }
68
68
 
69
69
  [build-dependencies]
70
70
  cbindgen = "0.29"
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 8
12
- #define KREUZBERG_VERSION_PATCH 1
13
- #define KREUZBERG_VERSION "4.8.1"
12
+ #define KREUZBERG_VERSION_PATCH 2
13
+ #define KREUZBERG_VERSION "4.8.2"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -1269,6 +1269,30 @@ KREUZBERG_EXPORT
1269
1269
  int32_t kreuzberg_config_builder_set_content_filter(struct ConfigBuilder *builder,
1270
1270
  const char *cf_json);
1271
1271
 
1272
+ /**
1273
+ * Set HTML output configuration from JSON.
1274
+ *
1275
+ * # Arguments
1276
+ *
1277
+ * * `builder` - Non-null pointer to ConfigBuilder
1278
+ * * `html_output_json` - JSON string for HTML output config
1279
+ *
1280
+ * # Returns
1281
+ *
1282
+ * 0 on success, -1 on error (check kreuzberg_last_error)
1283
+ *
1284
+ * # Safety
1285
+ *
1286
+ * This function is meant to be called from C/FFI code. The caller must ensure:
1287
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
1288
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
1289
+ * - `html_output_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
1290
+ * - The string pointer must remain valid for the duration of the function call
1291
+ */
1292
+ KREUZBERG_EXPORT
1293
+ int32_t kreuzberg_config_builder_set_html_output(struct ConfigBuilder *builder,
1294
+ const char *html_output_json);
1295
+
1272
1296
  /**
1273
1297
  * Build the final ExtractionConfig and consume the builder.
1274
1298
  *
@@ -14,7 +14,7 @@ use kreuzberg::core::config::LayoutDetectionConfig;
14
14
  #[cfg(feature = "tree-sitter")]
15
15
  use kreuzberg::core::config::TreeSitterConfig;
16
16
  use kreuzberg::core::config::{
17
- AccelerationConfig, ChunkingConfig, ContentFilterConfig, ExtractionConfig, ImageExtractionConfig,
17
+ AccelerationConfig, ChunkingConfig, ContentFilterConfig, ExtractionConfig, HtmlOutputConfig, ImageExtractionConfig,
18
18
  LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig,
19
19
  };
20
20
  use std::ffi::{CStr, c_char};
@@ -131,6 +131,13 @@ impl ConfigBuilder {
131
131
  Ok(())
132
132
  }
133
133
 
134
+ fn set_html_output_from_json(&mut self, json: &str) -> Result<(), String> {
135
+ let html_output_config: HtmlOutputConfig =
136
+ serde_json::from_str(json).map_err(|e| format!("Failed to parse HTML output config JSON: {}", e))?;
137
+ self.config.html_output = Some(html_output_config);
138
+ Ok(())
139
+ }
140
+
134
141
  fn build(self) -> ExtractionConfig {
135
142
  self.config
136
143
  }
@@ -854,6 +861,59 @@ pub unsafe extern "C" fn kreuzberg_config_builder_set_content_filter(
854
861
  })
855
862
  }
856
863
 
864
+ /// Set HTML output configuration from JSON.
865
+ ///
866
+ /// # Arguments
867
+ ///
868
+ /// * `builder` - Non-null pointer to ConfigBuilder
869
+ /// * `html_output_json` - JSON string for HTML output config
870
+ ///
871
+ /// # Returns
872
+ ///
873
+ /// 0 on success, -1 on error (check kreuzberg_last_error)
874
+ ///
875
+ /// # Safety
876
+ ///
877
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
878
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
879
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
880
+ /// - `html_output_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
881
+ /// - The string pointer must remain valid for the duration of the function call
882
+ #[unsafe(no_mangle)]
883
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_html_output(
884
+ builder: *mut ConfigBuilder,
885
+ html_output_json: *const c_char,
886
+ ) -> i32 {
887
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_html_output", {
888
+ if builder.is_null() {
889
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
890
+ return -1;
891
+ }
892
+ if html_output_json.is_null() {
893
+ set_last_error("HTML output JSON cannot be NULL".to_string());
894
+ return -1;
895
+ }
896
+
897
+ clear_last_error();
898
+
899
+ let json_str = match unsafe { CStr::from_ptr(html_output_json) }.to_str() {
900
+ Ok(s) => s,
901
+ Err(e) => {
902
+ set_last_error(format!("Invalid UTF-8 in HTML output JSON: {}", e));
903
+ return -1;
904
+ }
905
+ };
906
+
907
+ match unsafe { (*builder).set_html_output_from_json(json_str) } {
908
+ Ok(()) => 0,
909
+ Err(e) => {
910
+ set_last_error(e);
911
+ -1
912
+ }
913
+ }
914
+ })
915
+ }
916
+
857
917
  /// Build the final ExtractionConfig and consume the builder.
858
918
  ///
859
919
  /// After calling this function, the builder pointer is invalid and must not be used.
@@ -1024,6 +1084,32 @@ mod tests {
1024
1084
  }
1025
1085
  }
1026
1086
 
1087
+ #[test]
1088
+ fn test_builder_with_html_output() {
1089
+ unsafe {
1090
+ let builder = kreuzberg_config_builder_new();
1091
+ assert!(!builder.is_null());
1092
+
1093
+ let html_json = CString::new(
1094
+ r#"{"theme":"github","class_prefix":"kb-","embed_css":true,"css":".kb-p { color: red; }"}"#,
1095
+ )
1096
+ .unwrap();
1097
+ let result = kreuzberg_config_builder_set_html_output(builder, html_json.as_ptr());
1098
+ assert_eq!(result, 0);
1099
+
1100
+ let config = kreuzberg_config_builder_build(builder);
1101
+ assert!(!config.is_null());
1102
+
1103
+ assert!((*config).html_output.is_some());
1104
+ let ho = (*config).html_output.as_ref().unwrap();
1105
+ assert_eq!(ho.css.as_deref(), Some(".kb-p { color: red; }"));
1106
+ assert!(ho.embed_css);
1107
+
1108
+ // Clean up
1109
+ let _ = Box::from_raw(config);
1110
+ }
1111
+ }
1112
+
1027
1113
  #[test]
1028
1114
  fn test_builder_invalid_json() {
1029
1115
  unsafe {
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.8.1"
3
+ version = "4.8.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.8.1
4
+ version: 4.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-09 00:00:00.000000000 Z
11
+ date: 2026-04-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys