kreuzberg 4.9.0 → 4.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +7 -7
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +4 -4
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
- data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
- data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +18 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +21 -2
- data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
- data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs +17 -19
- data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/oxide/table.rs +4 -6
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +630 -61
- data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
- data/vendor/kreuzberg-ffi/src/error.rs +9 -8
- data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +52 -32
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 431f2a7c19d9a4404369b7c9fe80fd10233b34b3462cf3c0076aac9b7f3065cb
|
|
4
|
+
data.tar.gz: bf36fd9dd46ffb84374e8f64ffb1f61ac6774cc8dd56d43d404e0b8995813ffc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2cf3ad1e3edd0778537e542bcafc121b6cd14235e271d2b9b27747f7beab7a2899362426f450a06f9ff0d0d4365982acb68300e46568887a871ad1bda6767d00
|
|
7
|
+
data.tar.gz: e7bcbe73f50701fbd4a0cb7e75e588096c02202198b234ba6f830b846f4fabd95e051ed0c25ed4a1c1d72a933cf5ccf9d755129eb1dc54ea446a60d9d9946997
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -2916,7 +2916,7 @@ dependencies = [
|
|
|
2916
2916
|
|
|
2917
2917
|
[[package]]
|
|
2918
2918
|
name = "kreuzberg-rb"
|
|
2919
|
-
version = "4.
|
|
2919
|
+
version = "4.9.2"
|
|
2920
2920
|
dependencies = [
|
|
2921
2921
|
"async-trait",
|
|
2922
2922
|
"html-to-markdown-rs",
|
|
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
|
|
3040
3040
|
|
|
3041
3041
|
[[package]]
|
|
3042
3042
|
name = "liter-llm"
|
|
3043
|
-
version = "1.2.
|
|
3043
|
+
version = "1.2.2"
|
|
3044
3044
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3045
|
-
checksum = "
|
|
3045
|
+
checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
|
|
3046
3046
|
dependencies = [
|
|
3047
3047
|
"base64 0.22.1",
|
|
3048
3048
|
"bytes",
|
|
@@ -5734,9 +5734,9 @@ checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
|
|
|
5734
5734
|
|
|
5735
5735
|
[[package]]
|
|
5736
5736
|
name = "typenum"
|
|
5737
|
-
version = "1.
|
|
5737
|
+
version = "1.20.0"
|
|
5738
5738
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5739
|
-
checksum = "
|
|
5739
|
+
checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
|
|
5740
5740
|
|
|
5741
5741
|
[[package]]
|
|
5742
5742
|
name = "unicase"
|
|
@@ -6186,9 +6186,9 @@ dependencies = [
|
|
|
6186
6186
|
|
|
6187
6187
|
[[package]]
|
|
6188
6188
|
name = "web_atoms"
|
|
6189
|
-
version = "0.2.
|
|
6189
|
+
version = "0.2.4"
|
|
6190
6190
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6191
|
-
checksum = "
|
|
6191
|
+
checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538"
|
|
6192
6192
|
dependencies = [
|
|
6193
6193
|
"phf",
|
|
6194
6194
|
"phf_codegen",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.
|
|
5
|
+
version = "4.9.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.2.5", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.2", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.2" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.34", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.12.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.9.
|
|
3
|
+
version = "4.9.2"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.
|
|
21
|
+
> **🚀 Version 4.9.2 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -323,9 +323,12 @@ impl OcrConfig {
|
|
|
323
323
|
/// Returns the effective pipeline config.
|
|
324
324
|
///
|
|
325
325
|
/// - If `pipeline` is explicitly set, returns it.
|
|
326
|
-
/// - If `paddle-ocr`
|
|
327
|
-
/// auto-constructs
|
|
328
|
-
/// - Otherwise returns `None` (single-backend mode
|
|
326
|
+
/// - If `paddle-ocr` is compiled in and the backend is the default
|
|
327
|
+
/// (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
|
|
328
|
+
/// - Otherwise returns `None` (single-backend mode).
|
|
329
|
+
///
|
|
330
|
+
/// Explicit non-default backend selections are honored as-is — a silent
|
|
331
|
+
/// paddleocr fallback would mask errors from the chosen backend.
|
|
329
332
|
pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
|
|
330
333
|
if self.pipeline.is_some() {
|
|
331
334
|
return self.pipeline.clone();
|
|
@@ -333,25 +336,28 @@ impl OcrConfig {
|
|
|
333
336
|
|
|
334
337
|
#[cfg(feature = "paddle-ocr")]
|
|
335
338
|
{
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
339
|
+
if self.backend != default_tesseract_backend() {
|
|
340
|
+
return None;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
let stages = vec![
|
|
344
|
+
OcrPipelineStage {
|
|
345
|
+
backend: self.backend.clone(),
|
|
346
|
+
priority: 100,
|
|
347
|
+
language: None,
|
|
348
|
+
tesseract_config: self.tesseract_config.clone(),
|
|
349
|
+
paddle_ocr_config: None,
|
|
350
|
+
vlm_config: self.vlm_config.clone(),
|
|
351
|
+
},
|
|
352
|
+
OcrPipelineStage {
|
|
347
353
|
backend: "paddleocr".to_string(),
|
|
348
354
|
priority: 50,
|
|
349
355
|
language: None,
|
|
350
356
|
tesseract_config: None,
|
|
351
357
|
paddle_ocr_config: self.paddle_ocr_config.clone(),
|
|
352
358
|
vlm_config: None,
|
|
353
|
-
}
|
|
354
|
-
|
|
359
|
+
},
|
|
360
|
+
];
|
|
355
361
|
Some(OcrPipelineConfig {
|
|
356
362
|
stages,
|
|
357
363
|
quality_thresholds: self.effective_thresholds(),
|
|
@@ -485,29 +491,21 @@ mod tests {
|
|
|
485
491
|
}
|
|
486
492
|
|
|
487
493
|
#[test]
|
|
488
|
-
fn
|
|
489
|
-
// When primary backend is "paddleocr", effective_pipeline should NOT add
|
|
490
|
-
// a second paddleocr stage (issue #6 fix).
|
|
494
|
+
fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
|
|
491
495
|
let config = OcrConfig {
|
|
492
496
|
backend: "paddleocr".to_string(),
|
|
493
497
|
..Default::default()
|
|
494
498
|
};
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
);
|
|
506
|
-
}
|
|
507
|
-
#[cfg(not(feature = "paddle-ocr"))]
|
|
508
|
-
{
|
|
509
|
-
assert!(result.is_none());
|
|
510
|
-
}
|
|
499
|
+
assert!(config.effective_pipeline().is_none());
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
#[test]
|
|
503
|
+
fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
|
|
504
|
+
let config = OcrConfig {
|
|
505
|
+
backend: "easyocr".to_string(),
|
|
506
|
+
..Default::default()
|
|
507
|
+
};
|
|
508
|
+
assert!(config.effective_pipeline().is_none());
|
|
511
509
|
}
|
|
512
510
|
|
|
513
511
|
#[test]
|
|
@@ -282,9 +282,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
|
|
|
282
282
|
}),
|
|
283
283
|
});
|
|
284
284
|
|
|
285
|
-
let document = doc_builder
|
|
286
|
-
.map(|b| b.build())
|
|
287
|
-
.and_then(|d| if d.is_empty() { None } else { Some(d) });
|
|
285
|
+
let document = doc_builder.map(|b| b.build()).filter(|d| !d.is_empty());
|
|
288
286
|
|
|
289
287
|
Ok(PptxExtractionResult {
|
|
290
288
|
content,
|
|
@@ -85,7 +85,12 @@ impl DocumentExtractor for DocExtractor {
|
|
|
85
85
|
}
|
|
86
86
|
|
|
87
87
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
88
|
-
|
|
88
|
+
{
|
|
89
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
90
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
91
|
+
}
|
|
92
|
+
extract_doc_text(content)
|
|
93
|
+
}
|
|
89
94
|
}?;
|
|
90
95
|
|
|
91
96
|
let mut doc = InternalDocument::new("doc");
|
|
@@ -215,6 +215,9 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
215
215
|
}
|
|
216
216
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
217
217
|
{
|
|
218
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
219
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
220
|
+
}
|
|
218
221
|
crate::extraction::excel::read_excel_bytes(content, extension)?
|
|
219
222
|
}
|
|
220
223
|
};
|
|
@@ -172,7 +172,12 @@ impl DocumentExtractor for KeynoteExtractor {
|
|
|
172
172
|
}
|
|
173
173
|
|
|
174
174
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
175
|
-
|
|
175
|
+
{
|
|
176
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
177
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
178
|
+
}
|
|
179
|
+
parse_keynote(content)?
|
|
180
|
+
}
|
|
176
181
|
};
|
|
177
182
|
|
|
178
183
|
let mut doc = build_keynote_internal_document(&data);
|
|
@@ -181,7 +181,12 @@ impl DocumentExtractor for NumbersExtractor {
|
|
|
181
181
|
}
|
|
182
182
|
|
|
183
183
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
184
|
-
|
|
184
|
+
{
|
|
185
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
186
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
187
|
+
}
|
|
188
|
+
parse_numbers(content)?
|
|
189
|
+
}
|
|
185
190
|
};
|
|
186
191
|
|
|
187
192
|
let mut doc = build_numbers_internal_document(&data);
|
|
@@ -163,7 +163,12 @@ impl DocumentExtractor for PagesExtractor {
|
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
166
|
-
|
|
166
|
+
{
|
|
167
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
168
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
169
|
+
}
|
|
170
|
+
parse_pages(content)?
|
|
171
|
+
}
|
|
167
172
|
};
|
|
168
173
|
|
|
169
174
|
let mut doc = build_pages_internal_document(&data);
|
|
@@ -420,6 +420,10 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
420
420
|
content: &[u8],
|
|
421
421
|
config: &ExtractionConfig,
|
|
422
422
|
layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]>,
|
|
423
|
+
#[cfg(feature = "layout-detection")] layout_images: Option<&[image::DynamicImage]>,
|
|
424
|
+
#[cfg(not(feature = "layout-detection"))] _layout_images: Option<()>,
|
|
425
|
+
#[cfg(feature = "layout-detection")] layout_results: Option<&[crate::pdf::layout_runner::PageLayoutResult]>,
|
|
426
|
+
#[cfg(not(feature = "layout-detection"))] _layout_results: Option<()>,
|
|
423
427
|
) -> Result<PdfExtractionPhaseResult> {
|
|
424
428
|
let _span = tracing::debug_span!("extract_pdf_oxide").entered();
|
|
425
429
|
|
|
@@ -460,6 +464,11 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
460
464
|
OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
|
|
461
465
|
);
|
|
462
466
|
|
|
467
|
+
let allow_single_column = config
|
|
468
|
+
.pdf_options
|
|
469
|
+
.as_ref()
|
|
470
|
+
.is_some_and(|o| o.allow_single_column_tables);
|
|
471
|
+
|
|
463
472
|
let pre_rendered_doc =
|
|
464
473
|
if needs_structured && !config.force_ocr {
|
|
465
474
|
let k = config
|
|
@@ -503,6 +512,15 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
503
512
|
used_structure_tree,
|
|
504
513
|
image_positions: &image_positions,
|
|
505
514
|
layout_hints,
|
|
515
|
+
allow_single_column,
|
|
516
|
+
#[cfg(feature = "layout-detection")]
|
|
517
|
+
layout_images,
|
|
518
|
+
#[cfg(feature = "layout-detection")]
|
|
519
|
+
layout_results,
|
|
520
|
+
#[cfg(feature = "layout-detection")]
|
|
521
|
+
table_model: config.layout.as_ref().map(|l| l.table_model).unwrap_or_default(),
|
|
522
|
+
#[cfg(feature = "layout-detection")]
|
|
523
|
+
acceleration: config.acceleration.as_ref(),
|
|
506
524
|
},
|
|
507
525
|
) {
|
|
508
526
|
Ok(structured_doc) if !structured_doc.elements.is_empty() => {
|
|
@@ -1192,7 +1192,14 @@ impl PdfExtractor {
|
|
|
1192
1192
|
#[cfg(feature = "layout-detection")]
|
|
1193
1193
|
let layout_bundle = run_layout_detection(content, config);
|
|
1194
1194
|
#[cfg(feature = "layout-detection")]
|
|
1195
|
-
let layout_hints = layout_bundle
|
|
1195
|
+
let (layout_hints, layout_images, layout_results) = match layout_bundle {
|
|
1196
|
+
Some(ref bundle) => (
|
|
1197
|
+
Some(bundle.hints.as_slice()),
|
|
1198
|
+
Some(bundle.images.as_slice()),
|
|
1199
|
+
Some(bundle.results.as_slice()),
|
|
1200
|
+
),
|
|
1201
|
+
None => (None, None, None),
|
|
1202
|
+
};
|
|
1196
1203
|
#[cfg(not(feature = "layout-detection"))]
|
|
1197
1204
|
let layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]> = None;
|
|
1198
1205
|
|
|
@@ -1206,7 +1213,19 @@ impl PdfExtractor {
|
|
|
1206
1213
|
pre_rendered_doc,
|
|
1207
1214
|
_has_font_encoding_issues,
|
|
1208
1215
|
pdf_annotations,
|
|
1209
|
-
) = extract_all_from_oxide_document(
|
|
1216
|
+
) = extract_all_from_oxide_document(
|
|
1217
|
+
content,
|
|
1218
|
+
config,
|
|
1219
|
+
layout_hints,
|
|
1220
|
+
#[cfg(feature = "layout-detection")]
|
|
1221
|
+
layout_images,
|
|
1222
|
+
#[cfg(not(feature = "layout-detection"))]
|
|
1223
|
+
None,
|
|
1224
|
+
#[cfg(feature = "layout-detection")]
|
|
1225
|
+
layout_results,
|
|
1226
|
+
#[cfg(not(feature = "layout-detection"))]
|
|
1227
|
+
None,
|
|
1228
|
+
)?;
|
|
1210
1229
|
|
|
1211
1230
|
// --- OCR evaluation (reuses the same logic as the pdfium path) ---
|
|
1212
1231
|
#[cfg(feature = "ocr")]
|
|
@@ -136,7 +136,12 @@ impl DocumentExtractor for PptExtractor {
|
|
|
136
136
|
}
|
|
137
137
|
|
|
138
138
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
139
|
-
|
|
139
|
+
{
|
|
140
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
141
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
142
|
+
}
|
|
143
|
+
crate::extraction::ppt::extract_ppt_text_with_options(content, include_master_slides)
|
|
144
|
+
}
|
|
140
145
|
}?;
|
|
141
146
|
|
|
142
147
|
let mut metadata_map = AHashMap::new();
|
|
@@ -90,28 +90,26 @@ fn find_sentence_end(text: &str, from: usize) -> Option<usize> {
|
|
|
90
90
|
}
|
|
91
91
|
|
|
92
92
|
// Look for sentence terminals: . ! ?
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
// Consume closing quotes/brackets after terminal
|
|
104
|
-
while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
|
|
105
|
-
end += 1;
|
|
106
|
-
}
|
|
93
|
+
{
|
|
94
|
+
let offset = memchr3(b'.', b'!', b'?', &bytes[pos..])?;
|
|
95
|
+
let terminal_pos = pos + offset;
|
|
96
|
+
// Consume consecutive terminals (e.g., "..." or "?!")
|
|
97
|
+
let mut end = terminal_pos + 1;
|
|
98
|
+
while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
|
|
99
|
+
end += 1;
|
|
100
|
+
}
|
|
107
101
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
102
|
+
// Consume closing quotes/brackets after terminal
|
|
103
|
+
while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
|
|
104
|
+
end += 1;
|
|
105
|
+
}
|
|
112
106
|
|
|
113
|
-
|
|
107
|
+
// Check if this is a real sentence boundary
|
|
108
|
+
if is_sentence_boundary(text, terminal_pos, end) {
|
|
109
|
+
return Some(end);
|
|
114
110
|
}
|
|
111
|
+
|
|
112
|
+
pos = end;
|
|
115
113
|
}
|
|
116
114
|
}
|
|
117
115
|
|
|
@@ -9,8 +9,18 @@ use std::path::{Path, PathBuf};
|
|
|
9
9
|
use crate::layout::error::LayoutError;
|
|
10
10
|
use crate::model_download;
|
|
11
11
|
|
|
12
|
+
#[cfg(feature = "paddle-ocr")]
|
|
12
13
|
use crate::paddle_ocr::ModelManifestEntry;
|
|
13
14
|
|
|
15
|
+
#[cfg(not(feature = "paddle-ocr"))]
|
|
16
|
+
#[derive(Debug, Clone, serde::Serialize)]
|
|
17
|
+
pub struct ModelManifestEntry {
|
|
18
|
+
pub relative_path: String,
|
|
19
|
+
pub sha256: String,
|
|
20
|
+
pub size_bytes: u64,
|
|
21
|
+
pub source_url: String,
|
|
22
|
+
}
|
|
23
|
+
|
|
14
24
|
/// Model definition for a layout model.
|
|
15
25
|
#[derive(Debug, Clone)]
|
|
16
26
|
struct ModelDefinition {
|
|
@@ -320,12 +320,12 @@ pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Resu
|
|
|
320
320
|
let authors = metadata_cache[2]
|
|
321
321
|
.as_ref()
|
|
322
322
|
.map(|author_str| parse_authors(author_str))
|
|
323
|
-
.
|
|
323
|
+
.filter(|parsed| !parsed.is_empty());
|
|
324
324
|
|
|
325
325
|
let keywords = metadata_cache[3]
|
|
326
326
|
.as_ref()
|
|
327
327
|
.map(|keywords_str| parse_keywords(keywords_str))
|
|
328
|
-
.
|
|
328
|
+
.filter(|parsed| !parsed.is_empty());
|
|
329
329
|
|
|
330
330
|
let created_at = metadata_cache[4].as_ref().map(|date_str| parse_pdf_date(date_str));
|
|
331
331
|
|
|
@@ -94,11 +94,11 @@ fn extract_common_metadata(doc: &mut OxideDocument) -> Result<CommonPdfMetadata>
|
|
|
94
94
|
|
|
95
95
|
let authors = get_info_string(&mut doc.doc, "Author")
|
|
96
96
|
.map(|author_str| parse_authors(&author_str))
|
|
97
|
-
.
|
|
97
|
+
.filter(|parsed| !parsed.is_empty());
|
|
98
98
|
|
|
99
99
|
let keywords = get_info_string(&mut doc.doc, "Keywords")
|
|
100
100
|
.map(|kw_str| parse_keywords(&kw_str))
|
|
101
|
-
.
|
|
101
|
+
.filter(|parsed| !parsed.is_empty());
|
|
102
102
|
|
|
103
103
|
let created_at = get_info_string(&mut doc.doc, "CreationDate").map(|d| parse_pdf_date(&d));
|
|
104
104
|
let modified_at = get_info_string(&mut doc.doc, "ModDate").map(|d| parse_pdf_date(&d));
|
|
@@ -75,9 +75,7 @@ pub(crate) fn extract_tables_native(doc: &mut OxideDocument) -> Result<Vec<Table
|
|
|
75
75
|
///
|
|
76
76
|
/// Maps rows/cells from the native table structure to a 2D `Vec<Vec<String>>`
|
|
77
77
|
/// grid and builds a markdown representation with proper header separators.
|
|
78
|
-
fn convert_extracted_table(
|
|
79
|
-
table: &pdf_oxide::structure::table_extractor::ExtractedTable,
|
|
80
|
-
) -> (Vec<Vec<String>>, String) {
|
|
78
|
+
fn convert_extracted_table(table: &pdf_oxide::structure::table_extractor::Table) -> (Vec<Vec<String>>, String) {
|
|
81
79
|
let mut cells: Vec<Vec<String>> = Vec::with_capacity(table.rows.len());
|
|
82
80
|
let mut markdown = String::new();
|
|
83
81
|
let mut found_header = false;
|
|
@@ -124,7 +122,7 @@ mod tests {
|
|
|
124
122
|
|
|
125
123
|
#[test]
|
|
126
124
|
fn test_convert_extracted_table_basic() {
|
|
127
|
-
use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
|
|
125
|
+
use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
|
|
128
126
|
|
|
129
127
|
let table = ExtractedTable {
|
|
130
128
|
rows: vec![
|
|
@@ -191,7 +189,7 @@ mod tests {
|
|
|
191
189
|
|
|
192
190
|
#[test]
|
|
193
191
|
fn test_convert_extracted_table_no_header() {
|
|
194
|
-
use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
|
|
192
|
+
use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
|
|
195
193
|
|
|
196
194
|
let table = ExtractedTable {
|
|
197
195
|
rows: vec![
|
|
@@ -233,7 +231,7 @@ mod tests {
|
|
|
233
231
|
|
|
234
232
|
#[test]
|
|
235
233
|
fn test_convert_extracted_table_empty() {
|
|
236
|
-
use pdf_oxide::structure::table_extractor::ExtractedTable;
|
|
234
|
+
use pdf_oxide::structure::table_extractor::Table as ExtractedTable;
|
|
237
235
|
|
|
238
236
|
let table = ExtractedTable {
|
|
239
237
|
rows: vec![],
|