kreuzberg 4.9.4 → 4.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/config/types.rs +1 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/ocr.rs +8 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +72 -14
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +27 -3
- data/vendor/kreuzberg/src/core/extractor/file.rs +27 -3
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +26 -20
- data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
- data/vendor/kreuzberg/src/extraction/email.rs +72 -10
- data/vendor/kreuzberg/src/extraction/image.rs +2 -2
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +6 -1
- data/vendor/kreuzberg/src/extraction/transform/content.rs +249 -4
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -5
- data/vendor/kreuzberg/src/extractors/email.rs +12 -11
- data/vendor/kreuzberg/src/extractors/hwp.rs +18 -5
- data/vendor/kreuzberg/src/extractors/image.rs +11 -6
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +16 -2
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +46 -16
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +26 -8
- data/vendor/kreuzberg/src/mcp/params.rs +17 -1
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +1 -0
- data/vendor/kreuzberg/src/ocr/types.rs +11 -1
- data/vendor/kreuzberg/src/ort_discovery.rs +74 -22
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +108 -10
- data/vendor/kreuzberg/src/pdf/images.rs +134 -8
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +4 -4
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +7 -3
- data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +9 -0
- data/vendor/kreuzberg/src/rendering/djot.rs +8 -0
- data/vendor/kreuzberg/src/rendering/markdown.rs +7 -0
- data/vendor/kreuzberg/src/rendering/plain.rs +16 -7
- data/vendor/kreuzberg/src/types/formats.rs +6 -2
- data/vendor/kreuzberg/src/utils/image_decode.rs +99 -0
- data/vendor/kreuzberg/src/utils/mod.rs +8 -0
- data/vendor/kreuzberg/tests/docx_ocr_integration_test.rs +84 -0
- data/vendor/kreuzberg/tests/email_integration.rs +18 -7
- data/vendor/kreuzberg/tests/extraction_timeout_tests.rs +92 -0
- data/vendor/kreuzberg/tests/gpu_acceleration.rs +419 -0
- data/vendor/kreuzberg/tests/issue_797_preset_embedding_regression.rs +75 -0
- data/vendor/kreuzberg/tests/markdown_lint_quality.rs +18 -6
- data/vendor/kreuzberg/tests/mcp_integration.rs +13 -5
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +16 -20
- data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +129 -0
- data/vendor/kreuzberg/tests/test_batch_extract_schema.rs +56 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +5 -0
- metadata +8 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6921
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9f3132b44aad1652c76e8b1445b775eb3586e48661908eda794c95339f06387d
|
|
4
|
+
data.tar.gz: 2f957af07040ec2f3bcd79c299dd429a752423d714eea73bfb608a28718a6c11
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 878748ecb791e049c2de05cdc4ec7b9f6749bb265981c98ea49126108ca7c2782b92a6b5ed31d1fbfbeee83e3c45c80aaf74aacecd20f9bc428d796709afa0aa
|
|
7
|
+
data.tar.gz: ff137eb78f8fcfcc2ac357b0d9adf6d3d6fee11a448679a976678e0745905a0abcd8abfeb331028d780810d96dc47a04ec01dda94c900ec63ad4b35c124c187f
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.4" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.6" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-rb"
|
|
3
|
-
version = "4.9.4"
|
|
3
|
+
version = "4.9.6"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -65,7 +65,7 @@ tokio = { version = "1.52.1", features = [
|
|
|
65
65
|
"time",
|
|
66
66
|
"io-util",
|
|
67
67
|
] }
|
|
68
|
-
html-to-markdown-rs = { version = "3.
|
|
68
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false }
|
|
69
69
|
|
|
70
70
|
[dev-dependencies]
|
|
71
71
|
pretty_assertions = "1.4"
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.4"
|
|
5
|
+
version = "4.9.6"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -28,12 +28,12 @@ dbase = "0.7"
|
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
30
30
|
hex = "0.4.3"
|
|
31
|
-
html-to-markdown-rs = { version = "3.
|
|
31
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false }
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.4", default-features = false }
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.4" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.6", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.6" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -57,7 +57,7 @@ thiserror = "2.0.18"
|
|
|
57
57
|
tokio = { version = "1.52.1", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
58
58
|
toml = "1.1.2"
|
|
59
59
|
tracing = "0.1"
|
|
60
|
-
tree-sitter-language-pack = { version = "1.
|
|
60
|
+
tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false }
|
|
61
61
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
|
62
62
|
wasm-bindgen-futures = "0.4"
|
|
63
63
|
web-sys = { version = "0.3", features = ["Blob", "File", "FileReader", "console", "TextDecoder", "ImageData", "Window", "Response"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.4"
|
|
3
|
+
version = "4.9.6"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -271,7 +271,7 @@ hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
|
|
271
271
|
"simd",
|
|
272
272
|
], optional = true }
|
|
273
273
|
hex = "0.4.3"
|
|
274
|
-
html-to-markdown-rs = { version = "3.
|
|
274
|
+
html-to-markdown-rs = { version = "3.3.1", default-features = false, features = [
|
|
275
275
|
"inline-images",
|
|
276
276
|
"metadata",
|
|
277
277
|
], optional = true }
|
|
@@ -392,7 +392,7 @@ optional = true
|
|
|
392
392
|
# Override getrandom to enable js feature for WASM targets
|
|
393
393
|
# This is needed because ring/rustls (via ureq) depend on getrandom without js feature
|
|
394
394
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
395
|
-
tree-sitter-language-pack = { version = "1.
|
|
395
|
+
tree-sitter-language-pack = { version = "1.7.0", features = ["serde"], default-features = false, optional = true }
|
|
396
396
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
397
397
|
|
|
398
398
|
[build-dependencies]
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.4 Release**
|
|
21
|
+
> **🚀 Version 4.9.6 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -271,6 +271,13 @@ pub struct OcrConfig {
|
|
|
271
271
|
/// - `{{ language }}` — The document language code (e.g., "eng", "deu").
|
|
272
272
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
273
273
|
pub vlm_prompt: Option<String>,
|
|
274
|
+
|
|
275
|
+
/// Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
|
|
276
|
+
///
|
|
277
|
+
/// Not user-configurable via config files — injected at runtime from
|
|
278
|
+
/// `ExtractionConfig::acceleration` before each `process_image` call.
|
|
279
|
+
#[serde(skip)]
|
|
280
|
+
pub acceleration: Option<super::acceleration::AccelerationConfig>,
|
|
274
281
|
}
|
|
275
282
|
|
|
276
283
|
impl Default for OcrConfig {
|
|
@@ -288,6 +295,7 @@ impl Default for OcrConfig {
|
|
|
288
295
|
auto_rotate: false,
|
|
289
296
|
vlm_config: None,
|
|
290
297
|
vlm_prompt: None,
|
|
298
|
+
acceleration: None,
|
|
291
299
|
}
|
|
292
300
|
}
|
|
293
301
|
}
|
|
@@ -267,15 +267,10 @@ impl ChunkingConfig {
|
|
|
267
267
|
}
|
|
268
268
|
};
|
|
269
269
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
name: preset_name.clone(),
|
|
275
|
-
},
|
|
276
|
-
..EmbeddingConfig::default()
|
|
277
|
-
}),
|
|
278
|
-
};
|
|
270
|
+
// Preserve the caller's embedding choice, including None.
|
|
271
|
+
// Presets configure chunking parameters only; users must explicitly
|
|
272
|
+
// provide an EmbeddingConfig to opt into embedding generation.
|
|
273
|
+
let embedding = self.embedding.clone();
|
|
279
274
|
|
|
280
275
|
Self {
|
|
281
276
|
max_characters: preset.chunk_size,
|
|
@@ -568,11 +563,9 @@ mod tests {
|
|
|
568
563
|
let resolved = config.resolve_preset();
|
|
569
564
|
assert_eq!(resolved.max_characters, 1024);
|
|
570
565
|
assert_eq!(resolved.overlap, 100);
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
_ => panic!("Expected Preset model type"),
|
|
575
|
-
}
|
|
566
|
+
// Preset configures chunking parameters only; embedding stays None unless
|
|
567
|
+
// the caller explicitly provided one (#797).
|
|
568
|
+
assert!(resolved.embedding.is_none());
|
|
576
569
|
}
|
|
577
570
|
|
|
578
571
|
#[test]
|
|
@@ -686,4 +679,69 @@ mod tests {
|
|
|
686
679
|
_ => panic!("Expected Custom variant"),
|
|
687
680
|
}
|
|
688
681
|
}
|
|
682
|
+
|
|
683
|
+
// --- Issue #797 regression tests ---
|
|
684
|
+
|
|
685
|
+
/// Preset with no explicit embedding: embedding must remain None.
|
|
686
|
+
///
|
|
687
|
+
/// Before the fix, `resolve_preset()` would silently inject an
|
|
688
|
+
/// `EmbeddingConfig` whenever a preset was configured, causing every
|
|
689
|
+
/// chunk to have an unexpected `.embedding` field populated.
|
|
690
|
+
#[test]
|
|
691
|
+
#[cfg(feature = "embeddings")]
|
|
692
|
+
fn test_resolve_preset_does_not_inject_embedding_when_none() {
|
|
693
|
+
let config = ChunkingConfig {
|
|
694
|
+
preset: Some("multilingual".to_string()),
|
|
695
|
+
embedding: None,
|
|
696
|
+
..Default::default()
|
|
697
|
+
};
|
|
698
|
+
let resolved = config.resolve_preset();
|
|
699
|
+
assert!(
|
|
700
|
+
resolved.embedding.is_none(),
|
|
701
|
+
"preset alone must not inject an EmbeddingConfig (#797)"
|
|
702
|
+
);
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
/// Preset with an explicit embedding: the embedding must be preserved unchanged.
|
|
706
|
+
#[test]
|
|
707
|
+
#[cfg(feature = "embeddings")]
|
|
708
|
+
fn test_resolve_preset_preserves_explicit_embedding_config() {
|
|
709
|
+
let explicit = EmbeddingConfig {
|
|
710
|
+
model: EmbeddingModelType::Custom {
|
|
711
|
+
model_id: "my-org/model".to_string(),
|
|
712
|
+
dimensions: 768,
|
|
713
|
+
},
|
|
714
|
+
batch_size: 16,
|
|
715
|
+
..Default::default()
|
|
716
|
+
};
|
|
717
|
+
let config = ChunkingConfig {
|
|
718
|
+
preset: Some("multilingual".to_string()),
|
|
719
|
+
embedding: Some(explicit),
|
|
720
|
+
..Default::default()
|
|
721
|
+
};
|
|
722
|
+
let resolved = config.resolve_preset();
|
|
723
|
+
let emb = resolved
|
|
724
|
+
.embedding
|
|
725
|
+
.expect("explicit embedding must survive resolve_preset");
|
|
726
|
+
assert_eq!(emb.batch_size, 16);
|
|
727
|
+
match emb.model {
|
|
728
|
+
EmbeddingModelType::Custom { model_id, dimensions } => {
|
|
729
|
+
assert_eq!(model_id, "my-org/model");
|
|
730
|
+
assert_eq!(dimensions, 768);
|
|
731
|
+
}
|
|
732
|
+
other => panic!("expected Custom model type, got {other:?}"),
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// No preset, no embedding: embedding must stay None (regression guard).
|
|
737
|
+
#[test]
|
|
738
|
+
fn test_resolve_preset_no_preset_no_embedding_stays_none() {
|
|
739
|
+
let config = ChunkingConfig {
|
|
740
|
+
preset: None,
|
|
741
|
+
embedding: None,
|
|
742
|
+
..Default::default()
|
|
743
|
+
};
|
|
744
|
+
let resolved = config.resolve_preset();
|
|
745
|
+
assert!(resolved.embedding.is_none(), "no-preset path must not touch embedding");
|
|
746
|
+
}
|
|
689
747
|
}
|
|
@@ -66,7 +66,7 @@ use super::file::extract_bytes_with_extractor;
|
|
|
66
66
|
pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
67
67
|
use crate::core::mime;
|
|
68
68
|
|
|
69
|
-
let
|
|
69
|
+
let extraction_future = async {
|
|
70
70
|
if config.force_ocr && config.effective_disable_ocr() {
|
|
71
71
|
return Err(crate::KreuzbergError::Validation {
|
|
72
72
|
message: "force_ocr and disable_ocr cannot both be true".to_string(),
|
|
@@ -105,8 +105,32 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
|
|
|
105
105
|
}
|
|
106
106
|
|
|
107
107
|
extract_bytes_with_extractor(content, &validated_mime, config).await
|
|
108
|
-
}
|
|
109
|
-
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
#[cfg(feature = "tokio-runtime")]
|
|
111
|
+
let result = if let Some(secs) = config.extraction_timeout_secs {
|
|
112
|
+
let start = std::time::Instant::now();
|
|
113
|
+
match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
|
|
114
|
+
Ok(inner) => inner,
|
|
115
|
+
Err(_elapsed) => {
|
|
116
|
+
if let Some(ref token) = config.cancel_token {
|
|
117
|
+
token.cancel();
|
|
118
|
+
}
|
|
119
|
+
Err(crate::KreuzbergError::Timeout {
|
|
120
|
+
elapsed_ms: start.elapsed().as_millis() as u64,
|
|
121
|
+
limit_ms: secs * 1000,
|
|
122
|
+
})
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
} else {
|
|
126
|
+
extraction_future.await
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
130
|
+
let result = {
|
|
131
|
+
let _ = config.extraction_timeout_secs;
|
|
132
|
+
extraction_future.await
|
|
133
|
+
};
|
|
110
134
|
|
|
111
135
|
#[cfg(feature = "otel")]
|
|
112
136
|
if let Err(ref e) = result {
|
|
@@ -82,7 +82,7 @@ pub async fn extract_file(
|
|
|
82
82
|
);
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
-
let
|
|
85
|
+
let extraction_future = async {
|
|
86
86
|
io::validate_file_exists(path)?;
|
|
87
87
|
|
|
88
88
|
if config.force_ocr && config.effective_disable_ocr() {
|
|
@@ -119,8 +119,32 @@ pub async fn extract_file(
|
|
|
119
119
|
}
|
|
120
120
|
|
|
121
121
|
extract_file_with_extractor(path, &detected_mime, config).await
|
|
122
|
-
}
|
|
123
|
-
|
|
122
|
+
};
|
|
123
|
+
|
|
124
|
+
#[cfg(feature = "tokio-runtime")]
|
|
125
|
+
let result = if let Some(secs) = config.extraction_timeout_secs {
|
|
126
|
+
let start = std::time::Instant::now();
|
|
127
|
+
match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
|
|
128
|
+
Ok(inner) => inner,
|
|
129
|
+
Err(_elapsed) => {
|
|
130
|
+
if let Some(ref token) = config.cancel_token {
|
|
131
|
+
token.cancel();
|
|
132
|
+
}
|
|
133
|
+
Err(crate::KreuzbergError::Timeout {
|
|
134
|
+
elapsed_ms: start.elapsed().as_millis() as u64,
|
|
135
|
+
limit_ms: secs * 1000,
|
|
136
|
+
})
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
} else {
|
|
140
|
+
extraction_future.await
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
144
|
+
let result = {
|
|
145
|
+
let _ = config.extraction_timeout_secs;
|
|
146
|
+
extraction_future.await
|
|
147
|
+
};
|
|
124
148
|
|
|
125
149
|
#[cfg(feature = "otel")]
|
|
126
150
|
if let Err(ref e) = result {
|
|
@@ -55,6 +55,8 @@ use initialization::{get_processors_from_cache, initialize_features, initialize_
|
|
|
55
55
|
)
|
|
56
56
|
))]
|
|
57
57
|
pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
58
|
+
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
|
59
|
+
let mut doc = doc;
|
|
58
60
|
// Pre-render markdown for the chunker's heading context resolution when:
|
|
59
61
|
// - Markdown chunking is configured
|
|
60
62
|
// - Output format is not already Markdown (which would produce formatted_content anyway)
|
|
@@ -100,7 +102,30 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
|
|
|
100
102
|
}
|
|
101
103
|
};
|
|
102
104
|
|
|
103
|
-
// 1.
|
|
105
|
+
// 1. Process extracted images with OCR if configured
|
|
106
|
+
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
|
107
|
+
if config.ocr.is_some() && !doc.images.is_empty() {
|
|
108
|
+
let images_to_process = std::mem::take(&mut doc.images);
|
|
109
|
+
match crate::extraction::image_ocr::process_images_with_ocr(
|
|
110
|
+
images_to_process,
|
|
111
|
+
config,
|
|
112
|
+
&mut doc.processing_warnings,
|
|
113
|
+
)
|
|
114
|
+
.await
|
|
115
|
+
{
|
|
116
|
+
Ok(processed) => {
|
|
117
|
+
doc.images = processed;
|
|
118
|
+
}
|
|
119
|
+
Err(e) => {
|
|
120
|
+
doc.processing_warnings.push(crate::types::ProcessingWarning {
|
|
121
|
+
source: std::borrow::Cow::Borrowed("image_ocr"),
|
|
122
|
+
message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// 2. Derive ExtractionResult from InternalDocument
|
|
104
129
|
let include_structure = config.include_document_structure;
|
|
105
130
|
let mut result =
|
|
106
131
|
crate::extraction::derive::derive_extraction_result(doc, include_structure, config.output_format.clone());
|
|
@@ -111,25 +136,6 @@ pub async fn run_pipeline(doc: InternalDocument, config: &ExtractionConfig) -> R
|
|
|
111
136
|
result.formatted_content = Some(html);
|
|
112
137
|
}
|
|
113
138
|
|
|
114
|
-
// 1.5. Process extracted images with OCR if configured
|
|
115
|
-
#[cfg(all(feature = "ocr", feature = "tokio-runtime"))]
|
|
116
|
-
if config.ocr.is_some() && result.images.as_ref().is_some_and(|imgs| !imgs.is_empty()) {
|
|
117
|
-
let images_to_process = result.images.take().unwrap_or_default();
|
|
118
|
-
match crate::extraction::image_ocr::process_images_with_ocr(images_to_process, config).await {
|
|
119
|
-
Ok(processed) => {
|
|
120
|
-
result.images = if processed.is_empty() { None } else { Some(processed) };
|
|
121
|
-
}
|
|
122
|
-
Err(e) => {
|
|
123
|
-
result
|
|
124
|
-
.processing_warnings
|
|
125
|
-
.push(crate::types::extraction::ProcessingWarning {
|
|
126
|
-
source: std::borrow::Cow::Borrowed("image_ocr"),
|
|
127
|
-
message: std::borrow::Cow::Owned(format!("Image OCR failed: {e}")),
|
|
128
|
-
});
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
|
|
133
139
|
// Temporarily store pre-rendered markdown for chunker heading context.
|
|
134
140
|
// Tracked separately so we can remove it after chunking — apply_output_format
|
|
135
141
|
// must not swap this into result.content when output_format is Plain.
|
|
@@ -229,7 +229,7 @@ pub fn resolve_cache_dir() -> PathBuf {
|
|
|
229
229
|
/// Returns `Ok(Some(rotated_bytes))` if rotation was applied,
|
|
230
230
|
/// `Ok(None)` if no rotation needed (0° or low confidence).
|
|
231
231
|
pub fn detect_and_rotate(detector: &DocOrientationDetector, image_bytes: &[u8]) -> Result<Option<Vec<u8>>> {
|
|
232
|
-
let img =
|
|
232
|
+
let img = crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
|
|
233
233
|
.map_err(|e| KreuzbergError::Ocr {
|
|
234
234
|
message: format!("Failed to load image for orientation detection: {e}"),
|
|
235
235
|
source: None,
|
|
@@ -256,6 +256,47 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
256
256
|
}
|
|
257
257
|
// Extract HTML from nested message/rfc822 sub-messages.
|
|
258
258
|
collect_nested_message_html(&message, &mut all_html);
|
|
259
|
+
|
|
260
|
+
// Fallback: if no dedicated HTML body was found, check if the message
|
|
261
|
+
// parts include HTML content. For simple HTML emails, mail-parser might
|
|
262
|
+
// not expose HTML via body_html() but it's still in the parts.
|
|
263
|
+
if all_html.is_empty() {
|
|
264
|
+
use mail_parser::{MimeHeaders, PartType};
|
|
265
|
+
for part in &message.parts {
|
|
266
|
+
if let Some(ct) = part.content_type() {
|
|
267
|
+
let is_html = ct.subtype().map(|s| s.eq_ignore_ascii_case("html")).unwrap_or(false);
|
|
268
|
+
if is_html {
|
|
269
|
+
match &part.body {
|
|
270
|
+
PartType::Text(t) | PartType::Html(t) => {
|
|
271
|
+
all_html.push(t.to_string());
|
|
272
|
+
}
|
|
273
|
+
_ => {}
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Final fallback: if still no HTML found, manually extract body from raw bytes.
|
|
281
|
+
// Mail-parser sometimes doesn't parse simple single-part HTML emails correctly.
|
|
282
|
+
if all_html.is_empty()
|
|
283
|
+
&& let Ok(data_str) = std::str::from_utf8(&data)
|
|
284
|
+
{
|
|
285
|
+
// Find the blank line that separates headers from body
|
|
286
|
+
// Try both CRLF and LF line endings
|
|
287
|
+
let body = if let Some(pos) = data_str.find("\r\n\r\n") {
|
|
288
|
+
&data_str[pos + 4..]
|
|
289
|
+
} else if let Some(pos) = data_str.find("\n\n") {
|
|
290
|
+
&data_str[pos + 2..]
|
|
291
|
+
} else {
|
|
292
|
+
""
|
|
293
|
+
};
|
|
294
|
+
|
|
295
|
+
if !body.is_empty() {
|
|
296
|
+
all_html.push(body.to_string());
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
259
300
|
if all_html.is_empty() {
|
|
260
301
|
None
|
|
261
302
|
} else {
|
|
@@ -264,11 +305,27 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
|
|
|
264
305
|
};
|
|
265
306
|
|
|
266
307
|
let cleaned_text = if let Some(ref plain) = plain_text {
|
|
267
|
-
|
|
308
|
+
// If plain_text contains HTML tags, treat it as HTML
|
|
309
|
+
if plain.contains("<html") || plain.contains("<body") || plain.contains("<!DOCTYPE") {
|
|
310
|
+
clean_html_content(plain)
|
|
311
|
+
} else {
|
|
312
|
+
plain.clone()
|
|
313
|
+
}
|
|
268
314
|
} else if let Some(html) = &html_content {
|
|
269
315
|
clean_html_content(html)
|
|
270
316
|
} else {
|
|
271
|
-
|
|
317
|
+
// Last resort: if no plain text or extracted HTML, try body_text(0)
|
|
318
|
+
// which might contain HTML content for pure HTML emails
|
|
319
|
+
if let Some(text) = message.body_text(0) {
|
|
320
|
+
// Check if this is actually HTML content
|
|
321
|
+
if text.contains("<html") || text.contains("<body") || text.contains("<!DOCTYPE") {
|
|
322
|
+
clean_html_content(&text)
|
|
323
|
+
} else {
|
|
324
|
+
text.to_string()
|
|
325
|
+
}
|
|
326
|
+
} else {
|
|
327
|
+
String::new()
|
|
328
|
+
}
|
|
272
329
|
};
|
|
273
330
|
|
|
274
331
|
let mut attachments = Vec::with_capacity(message.attachments().count().min(20));
|
|
@@ -1310,7 +1367,18 @@ fn clean_html_content(html: &str) -> String {
|
|
|
1310
1367
|
return String::new();
|
|
1311
1368
|
}
|
|
1312
1369
|
|
|
1313
|
-
//
|
|
1370
|
+
// First try: regex-based HTML stripping (most reliable)
|
|
1371
|
+
let cleaned = script_regex().replace_all(html, "");
|
|
1372
|
+
let cleaned = style_regex().replace_all(&cleaned, "");
|
|
1373
|
+
let cleaned = html_tag_regex().replace_all(&cleaned, "");
|
|
1374
|
+
let cleaned = whitespace_regex().replace_all(&cleaned, " ");
|
|
1375
|
+
let text = cleaned.trim().to_string();
|
|
1376
|
+
|
|
1377
|
+
if !text.is_empty() {
|
|
1378
|
+
return text;
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1381
|
+
// Fallback: try html-to-markdown converter if regex stripping produced nothing
|
|
1314
1382
|
#[cfg(feature = "html")]
|
|
1315
1383
|
{
|
|
1316
1384
|
if let Ok(text) = crate::extraction::html::convert_html_to_markdown(
|
|
@@ -1325,13 +1393,7 @@ fn clean_html_content(html: &str) -> String {
|
|
|
1325
1393
|
}
|
|
1326
1394
|
}
|
|
1327
1395
|
|
|
1328
|
-
|
|
1329
|
-
let cleaned = script_regex().replace_all(html, "");
|
|
1330
|
-
let cleaned = style_regex().replace_all(&cleaned, "");
|
|
1331
|
-
let cleaned = html_tag_regex().replace_all(&cleaned, "");
|
|
1332
|
-
let cleaned = whitespace_regex().replace_all(&cleaned, " ");
|
|
1333
|
-
|
|
1334
|
-
cleaned.trim().to_string()
|
|
1396
|
+
String::new()
|
|
1335
1397
|
}
|
|
1336
1398
|
|
|
1337
1399
|
fn is_image_mime_type(mime_type: &str) -> bool {
|
|
@@ -342,8 +342,8 @@ pub fn load_image_for_ocr(image_bytes: &[u8]) -> Result<image::DynamicImage> {
|
|
|
342
342
|
} else if is_jbig2(image_bytes) {
|
|
343
343
|
decode_jbig2_to_gray(image_bytes).map(image::DynamicImage::ImageLuma8)
|
|
344
344
|
} else {
|
|
345
|
-
|
|
346
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {}", e)))
|
|
345
|
+
crate::utils::image_decode::decode_with_pixel_cap(image_bytes)
|
|
346
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to decode image: {e}")))
|
|
347
347
|
}
|
|
348
348
|
}
|
|
349
349
|
|
|
@@ -43,6 +43,7 @@ use crate::types::{ExtractedImage, ExtractionResult};
|
|
|
43
43
|
pub async fn process_images_with_ocr(
|
|
44
44
|
mut images: Vec<ExtractedImage>,
|
|
45
45
|
config: &crate::core::config::ExtractionConfig,
|
|
46
|
+
warnings: &mut Vec<crate::types::ProcessingWarning>,
|
|
46
47
|
) -> crate::Result<Vec<ExtractedImage>> {
|
|
47
48
|
if images.is_empty() || config.ocr.is_none() {
|
|
48
49
|
return Ok(images);
|
|
@@ -125,7 +126,11 @@ pub async fn process_images_with_ocr(
|
|
|
125
126
|
};
|
|
126
127
|
images[idx].ocr_result = Some(Box::new(extraction_result));
|
|
127
128
|
}
|
|
128
|
-
Err(
|
|
129
|
+
Err(e) => {
|
|
130
|
+
warnings.push(crate::types::ProcessingWarning {
|
|
131
|
+
source: std::borrow::Cow::Borrowed("image_ocr"),
|
|
132
|
+
message: std::borrow::Cow::Owned(format!("Image {} OCR failed: {}", idx, e)),
|
|
133
|
+
});
|
|
129
134
|
images[idx].ocr_result = None;
|
|
130
135
|
}
|
|
131
136
|
}
|