kreuzberg 4.8.1 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +10 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +41 -1
- data/vendor/kreuzberg/tests/api_consistency.rs +1 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +26 -2
- data/vendor/kreuzberg-ffi/src/config_builder.rs +87 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4ec647109dee8229fd50ce9fe4d87f13bbb769779b22b4862cac321052610d6c
|
|
4
|
+
data.tar.gz: 18c3cf6df8e339f4286da7fe9b8da84185fa93abf89fcbf339b259c0095a9a5c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f8c0ab16048bdb9026b55ff15f3ae342af9178a2ab6b1fd85777c3271da35e46474bee025af429745d20584604d520d85a4c8e4ea96bffc49ef5dfca55471b6f
|
|
7
|
+
data.tar.gz: 46ec0ff10138bd48d7b9ffada23956cd557616d8fdf09090f16562d4daab04381176b2881e08bfdfc1f726a0061e1e25feefa695620fef530444662abb866605
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1005,6 +1005,16 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
1005
1005
|
config.html_options = Some(parse_html_options(ruby, html_hash)?);
|
|
1006
1006
|
}
|
|
1007
1007
|
|
|
1008
|
+
if let Some(val) = get_kw(ruby, hash, "html_output")
|
|
1009
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
1010
|
+
{
|
|
1011
|
+
let html_output_json = ruby_value_to_json(val)?;
|
|
1012
|
+
let parsed: kreuzberg::core::config::html_output::HtmlOutputConfig =
|
|
1013
|
+
serde_json::from_value(html_output_json)
|
|
1014
|
+
.map_err(|e| runtime_error(format!("Invalid html_output: {}", e)))?;
|
|
1015
|
+
config.html_output = Some(parsed);
|
|
1016
|
+
}
|
|
1017
|
+
|
|
1008
1018
|
if let Some(val) = get_kw(ruby, hash, "pages")
|
|
1009
1019
|
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
1010
1020
|
{
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.8.1"
|
|
5
|
+
version = "4.8.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.1", default-features = false }
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.1" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.2", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.2" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.184"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.8.1 Release**
|
|
21
|
+
> **🚀 Version 4.8.2 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -1240,7 +1240,10 @@ pub fn extract_document_structure(
|
|
|
1240
1240
|
// Deduplicate paragraphs with identical text within each page.
|
|
1241
1241
|
// Catches bold/shadow rendering artifacts (consecutive duplicates)
|
|
1242
1242
|
// and table content rendered as both table and body text.
|
|
1243
|
-
deduplicate_paragraphs(&mut all_page_paragraphs);
|
|
1243
|
+
// When strip_repeating_text is disabled, skip dedup to preserve all content.
|
|
1244
|
+
if strip_repeating_text {
|
|
1245
|
+
deduplicate_paragraphs(&mut all_page_paragraphs);
|
|
1246
|
+
}
|
|
1244
1247
|
|
|
1245
1248
|
let total_paragraphs: usize = all_page_paragraphs.iter().map(|p| p.len()).sum();
|
|
1246
1249
|
tracing::debug!(
|
|
@@ -2291,4 +2294,41 @@ mod tests {
|
|
|
2291
2294
|
assert!(paras[0].is_page_furniture);
|
|
2292
2295
|
assert!(paras[1].is_page_furniture);
|
|
2293
2296
|
}
|
|
2297
|
+
|
|
2298
|
+
#[test]
|
|
2299
|
+
fn test_deduplicate_paragraphs_removes_consecutive_duplicates() {
|
|
2300
|
+
let p1 = para(vec![line(vec![full_line_seg("Brand loses market share")])]);
|
|
2301
|
+
let p2 = para(vec![line(vec![full_line_seg("Brand loses market share")])]);
|
|
2302
|
+
let p3 = para(vec![line(vec![full_line_seg("Different content here")])]);
|
|
2303
|
+
let mut pages = vec![vec![p1, p2, p3]];
|
|
2304
|
+
deduplicate_paragraphs(&mut pages);
|
|
2305
|
+
assert_eq!(pages[0].len(), 2, "consecutive duplicate should be removed");
|
|
2306
|
+
}
|
|
2307
|
+
|
|
2308
|
+
#[test]
|
|
2309
|
+
fn test_deduplicate_paragraphs_removes_non_consecutive_body_duplicates() {
|
|
2310
|
+
let p1 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
|
|
2311
|
+
let p2 = para(vec![line(vec![full_line_seg("Some intervening paragraph")])]);
|
|
2312
|
+
let p3 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
|
|
2313
|
+
let mut pages = vec![vec![p1, p2, p3]];
|
|
2314
|
+
deduplicate_paragraphs(&mut pages);
|
|
2315
|
+
assert_eq!(pages[0].len(), 2, "non-consecutive body duplicate should be removed");
|
|
2316
|
+
}
|
|
2317
|
+
|
|
2318
|
+
#[test]
|
|
2319
|
+
fn test_deduplicate_paragraphs_preserves_non_consecutive_headings() {
|
|
2320
|
+
// Pass 2 (non-consecutive dedup) skips headings via is_dedup_candidate.
|
|
2321
|
+
let mut h = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
|
|
2322
|
+
h.heading_level = Some(2);
|
|
2323
|
+
let filler = para(vec![line(vec![full_line_seg("Some other content between them")])]);
|
|
2324
|
+
let mut h2 = para(vec![line(vec![full_line_seg("Brand loses market share in volume")])]);
|
|
2325
|
+
h2.heading_level = Some(2);
|
|
2326
|
+
let mut pages = vec![vec![h, filler, h2]];
|
|
2327
|
+
deduplicate_paragraphs(&mut pages);
|
|
2328
|
+
assert_eq!(
|
|
2329
|
+
pages[0].len(),
|
|
2330
|
+
3,
|
|
2331
|
+
"non-consecutive heading duplicates must be preserved"
|
|
2332
|
+
);
|
|
2333
|
+
}
|
|
2294
2334
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-ffi"
|
|
3
|
-
version = "4.8.1"
|
|
3
|
+
version = "4.8.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
|
|
|
41
41
|
tokio = { version = "1.51.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
42
42
|
|
|
43
43
|
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
44
|
-
kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false, features = [
|
|
44
|
+
kreuzberg = { path = "../kreuzberg", version = "4.8.2", default-features = false, features = [
|
|
45
45
|
"pdf",
|
|
46
46
|
"excel",
|
|
47
47
|
"office",
|
|
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false
|
|
|
64
64
|
] }
|
|
65
65
|
|
|
66
66
|
[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
|
|
67
|
-
kreuzberg = { path = "../kreuzberg", version = "4.8.1", default-features = false, features = ["bundled-pdfium", "full"] }
|
|
67
|
+
kreuzberg = { path = "../kreuzberg", version = "4.8.2", default-features = false, features = ["bundled-pdfium", "full"] }
|
|
68
68
|
|
|
69
69
|
[build-dependencies]
|
|
70
70
|
cbindgen = "0.29"
|
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
|
|
10
10
|
#define KREUZBERG_VERSION_MAJOR 4
|
|
11
11
|
#define KREUZBERG_VERSION_MINOR 8
|
|
12
|
-
#define KREUZBERG_VERSION_PATCH 1
|
|
13
|
-
#define KREUZBERG_VERSION "4.8.1"
|
|
12
|
+
#define KREUZBERG_VERSION_PATCH 2
|
|
13
|
+
#define KREUZBERG_VERSION "4.8.2"
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
#include <stdarg.h>
|
|
@@ -1269,6 +1269,30 @@ KREUZBERG_EXPORT
|
|
|
1269
1269
|
int32_t kreuzberg_config_builder_set_content_filter(struct ConfigBuilder *builder,
|
|
1270
1270
|
const char *cf_json);
|
|
1271
1271
|
|
|
1272
|
+
/**
|
|
1273
|
+
* Set HTML output configuration from JSON.
|
|
1274
|
+
*
|
|
1275
|
+
* # Arguments
|
|
1276
|
+
*
|
|
1277
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
1278
|
+
* * `html_output_json` - JSON string for HTML output config
|
|
1279
|
+
*
|
|
1280
|
+
* # Returns
|
|
1281
|
+
*
|
|
1282
|
+
* 0 on success, -1 on error (check kreuzberg_last_error)
|
|
1283
|
+
*
|
|
1284
|
+
* # Safety
|
|
1285
|
+
*
|
|
1286
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
1287
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
1288
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
1289
|
+
* - `html_output_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
1290
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
1291
|
+
*/
|
|
1292
|
+
KREUZBERG_EXPORT
|
|
1293
|
+
int32_t kreuzberg_config_builder_set_html_output(struct ConfigBuilder *builder,
|
|
1294
|
+
const char *html_output_json);
|
|
1295
|
+
|
|
1272
1296
|
/**
|
|
1273
1297
|
* Build the final ExtractionConfig and consume the builder.
|
|
1274
1298
|
*
|
|
@@ -14,7 +14,7 @@ use kreuzberg::core::config::LayoutDetectionConfig;
|
|
|
14
14
|
#[cfg(feature = "tree-sitter")]
|
|
15
15
|
use kreuzberg::core::config::TreeSitterConfig;
|
|
16
16
|
use kreuzberg::core::config::{
|
|
17
|
-
AccelerationConfig, ChunkingConfig, ContentFilterConfig, ExtractionConfig, ImageExtractionConfig,
|
|
17
|
+
AccelerationConfig, ChunkingConfig, ContentFilterConfig, ExtractionConfig, HtmlOutputConfig, ImageExtractionConfig,
|
|
18
18
|
LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig,
|
|
19
19
|
};
|
|
20
20
|
use std::ffi::{CStr, c_char};
|
|
@@ -131,6 +131,13 @@ impl ConfigBuilder {
|
|
|
131
131
|
Ok(())
|
|
132
132
|
}
|
|
133
133
|
|
|
134
|
+
fn set_html_output_from_json(&mut self, json: &str) -> Result<(), String> {
|
|
135
|
+
let html_output_config: HtmlOutputConfig =
|
|
136
|
+
serde_json::from_str(json).map_err(|e| format!("Failed to parse HTML output config JSON: {}", e))?;
|
|
137
|
+
self.config.html_output = Some(html_output_config);
|
|
138
|
+
Ok(())
|
|
139
|
+
}
|
|
140
|
+
|
|
134
141
|
fn build(self) -> ExtractionConfig {
|
|
135
142
|
self.config
|
|
136
143
|
}
|
|
@@ -854,6 +861,59 @@ pub unsafe extern "C" fn kreuzberg_config_builder_set_content_filter(
|
|
|
854
861
|
})
|
|
855
862
|
}
|
|
856
863
|
|
|
864
|
+
/// Set HTML output configuration from JSON.
|
|
865
|
+
///
|
|
866
|
+
/// # Arguments
|
|
867
|
+
///
|
|
868
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
869
|
+
/// * `html_output_json` - JSON string for HTML output config
|
|
870
|
+
///
|
|
871
|
+
/// # Returns
|
|
872
|
+
///
|
|
873
|
+
/// 0 on success, -1 on error (check kreuzberg_last_error)
|
|
874
|
+
///
|
|
875
|
+
/// # Safety
|
|
876
|
+
///
|
|
877
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
878
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
879
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
880
|
+
/// - `html_output_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
881
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
882
|
+
#[unsafe(no_mangle)]
|
|
883
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_html_output(
|
|
884
|
+
builder: *mut ConfigBuilder,
|
|
885
|
+
html_output_json: *const c_char,
|
|
886
|
+
) -> i32 {
|
|
887
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_html_output", {
|
|
888
|
+
if builder.is_null() {
|
|
889
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
890
|
+
return -1;
|
|
891
|
+
}
|
|
892
|
+
if html_output_json.is_null() {
|
|
893
|
+
set_last_error("HTML output JSON cannot be NULL".to_string());
|
|
894
|
+
return -1;
|
|
895
|
+
}
|
|
896
|
+
|
|
897
|
+
clear_last_error();
|
|
898
|
+
|
|
899
|
+
let json_str = match unsafe { CStr::from_ptr(html_output_json) }.to_str() {
|
|
900
|
+
Ok(s) => s,
|
|
901
|
+
Err(e) => {
|
|
902
|
+
set_last_error(format!("Invalid UTF-8 in HTML output JSON: {}", e));
|
|
903
|
+
return -1;
|
|
904
|
+
}
|
|
905
|
+
};
|
|
906
|
+
|
|
907
|
+
match unsafe { (*builder).set_html_output_from_json(json_str) } {
|
|
908
|
+
Ok(()) => 0,
|
|
909
|
+
Err(e) => {
|
|
910
|
+
set_last_error(e);
|
|
911
|
+
-1
|
|
912
|
+
}
|
|
913
|
+
}
|
|
914
|
+
})
|
|
915
|
+
}
|
|
916
|
+
|
|
857
917
|
/// Build the final ExtractionConfig and consume the builder.
|
|
858
918
|
///
|
|
859
919
|
/// After calling this function, the builder pointer is invalid and must not be used.
|
|
@@ -1024,6 +1084,32 @@ mod tests {
|
|
|
1024
1084
|
}
|
|
1025
1085
|
}
|
|
1026
1086
|
|
|
1087
|
+
#[test]
|
|
1088
|
+
fn test_builder_with_html_output() {
|
|
1089
|
+
unsafe {
|
|
1090
|
+
let builder = kreuzberg_config_builder_new();
|
|
1091
|
+
assert!(!builder.is_null());
|
|
1092
|
+
|
|
1093
|
+
let html_json = CString::new(
|
|
1094
|
+
r#"{"theme":"github","class_prefix":"kb-","embed_css":true,"css":".kb-p { color: red; }"}"#,
|
|
1095
|
+
)
|
|
1096
|
+
.unwrap();
|
|
1097
|
+
let result = kreuzberg_config_builder_set_html_output(builder, html_json.as_ptr());
|
|
1098
|
+
assert_eq!(result, 0);
|
|
1099
|
+
|
|
1100
|
+
let config = kreuzberg_config_builder_build(builder);
|
|
1101
|
+
assert!(!config.is_null());
|
|
1102
|
+
|
|
1103
|
+
assert!((*config).html_output.is_some());
|
|
1104
|
+
let ho = (*config).html_output.as_ref().unwrap();
|
|
1105
|
+
assert_eq!(ho.css.as_deref(), Some(".kb-p { color: red; }"));
|
|
1106
|
+
assert!(ho.embed_css);
|
|
1107
|
+
|
|
1108
|
+
// Clean up
|
|
1109
|
+
let _ = Box::from_raw(config);
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1027
1113
|
#[test]
|
|
1028
1114
|
fn test_builder_invalid_json() {
|
|
1029
1115
|
unsafe {
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.8.1
|
|
4
|
+
version: 4.8.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|