kreuzberg 4.9.1 → 4.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +3 -3
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
- data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
- data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +18 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +21 -2
- data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
- data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +625 -58
- data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
- data/vendor/kreuzberg-ffi/src/error.rs +9 -8
- data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 431f2a7c19d9a4404369b7c9fe80fd10233b34b3462cf3c0076aac9b7f3065cb
|
|
4
|
+
data.tar.gz: bf36fd9dd46ffb84374e8f64ffb1f61ac6774cc8dd56d43d404e0b8995813ffc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2cf3ad1e3edd0778537e542bcafc121b6cd14235e271d2b9b27747f7beab7a2899362426f450a06f9ff0d0d4365982acb68300e46568887a871ad1bda6767d00
|
|
7
|
+
data.tar.gz: e7bcbe73f50701fbd4a0cb7e75e588096c02202198b234ba6f830b846f4fabd95e051ed0c25ed4a1c1d72a933cf5ccf9d755129eb1dc54ea446a60d9d9946997
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -2916,7 +2916,7 @@ dependencies = [
|
|
|
2916
2916
|
|
|
2917
2917
|
[[package]]
|
|
2918
2918
|
name = "kreuzberg-rb"
|
|
2919
|
-
version = "4.9.1"
|
|
2919
|
+
version = "4.9.2"
|
|
2920
2920
|
dependencies = [
|
|
2921
2921
|
"async-trait",
|
|
2922
2922
|
"html-to-markdown-rs",
|
|
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
|
|
|
3040
3040
|
|
|
3041
3041
|
[[package]]
|
|
3042
3042
|
name = "liter-llm"
|
|
3043
|
-
version = "1.2.
|
|
3043
|
+
version = "1.2.2"
|
|
3044
3044
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3045
|
-
checksum = "
|
|
3045
|
+
checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
|
|
3046
3046
|
dependencies = [
|
|
3047
3047
|
"base64 0.22.1",
|
|
3048
3048
|
"bytes",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.9.1"
|
|
5
|
+
version = "4.9.2"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.2.5", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.1", default-features = false }
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.1" }
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.9.2", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.2" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.185"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.9.1 Release**
|
|
21
|
+
> **🚀 Version 4.9.2 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -323,9 +323,12 @@ impl OcrConfig {
|
|
|
323
323
|
/// Returns the effective pipeline config.
|
|
324
324
|
///
|
|
325
325
|
/// - If `pipeline` is explicitly set, returns it.
|
|
326
|
-
/// - If `paddle-ocr`
|
|
327
|
-
/// auto-constructs
|
|
328
|
-
/// - Otherwise returns `None` (single-backend mode
|
|
326
|
+
/// - If `paddle-ocr` is compiled in and the backend is the default
|
|
327
|
+
/// (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
|
|
328
|
+
/// - Otherwise returns `None` (single-backend mode).
|
|
329
|
+
///
|
|
330
|
+
/// Explicit non-default backend selections are honored as-is — a silent
|
|
331
|
+
/// paddleocr fallback would mask errors from the chosen backend.
|
|
329
332
|
pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
|
|
330
333
|
if self.pipeline.is_some() {
|
|
331
334
|
return self.pipeline.clone();
|
|
@@ -333,25 +336,28 @@ impl OcrConfig {
|
|
|
333
336
|
|
|
334
337
|
#[cfg(feature = "paddle-ocr")]
|
|
335
338
|
{
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
339
|
+
if self.backend != default_tesseract_backend() {
|
|
340
|
+
return None;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
let stages = vec![
|
|
344
|
+
OcrPipelineStage {
|
|
345
|
+
backend: self.backend.clone(),
|
|
346
|
+
priority: 100,
|
|
347
|
+
language: None,
|
|
348
|
+
tesseract_config: self.tesseract_config.clone(),
|
|
349
|
+
paddle_ocr_config: None,
|
|
350
|
+
vlm_config: self.vlm_config.clone(),
|
|
351
|
+
},
|
|
352
|
+
OcrPipelineStage {
|
|
347
353
|
backend: "paddleocr".to_string(),
|
|
348
354
|
priority: 50,
|
|
349
355
|
language: None,
|
|
350
356
|
tesseract_config: None,
|
|
351
357
|
paddle_ocr_config: self.paddle_ocr_config.clone(),
|
|
352
358
|
vlm_config: None,
|
|
353
|
-
}
|
|
354
|
-
|
|
359
|
+
},
|
|
360
|
+
];
|
|
355
361
|
Some(OcrPipelineConfig {
|
|
356
362
|
stages,
|
|
357
363
|
quality_thresholds: self.effective_thresholds(),
|
|
@@ -485,29 +491,21 @@ mod tests {
|
|
|
485
491
|
}
|
|
486
492
|
|
|
487
493
|
#[test]
|
|
488
|
-
fn
|
|
489
|
-
// When primary backend is "paddleocr", effective_pipeline should NOT add
|
|
490
|
-
// a second paddleocr stage (issue #6 fix).
|
|
494
|
+
fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
|
|
491
495
|
let config = OcrConfig {
|
|
492
496
|
backend: "paddleocr".to_string(),
|
|
493
497
|
..Default::default()
|
|
494
498
|
};
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
);
|
|
506
|
-
}
|
|
507
|
-
#[cfg(not(feature = "paddle-ocr"))]
|
|
508
|
-
{
|
|
509
|
-
assert!(result.is_none());
|
|
510
|
-
}
|
|
499
|
+
assert!(config.effective_pipeline().is_none());
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
#[test]
|
|
503
|
+
fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
|
|
504
|
+
let config = OcrConfig {
|
|
505
|
+
backend: "easyocr".to_string(),
|
|
506
|
+
..Default::default()
|
|
507
|
+
};
|
|
508
|
+
assert!(config.effective_pipeline().is_none());
|
|
511
509
|
}
|
|
512
510
|
|
|
513
511
|
#[test]
|
|
@@ -282,9 +282,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
|
|
|
282
282
|
}),
|
|
283
283
|
});
|
|
284
284
|
|
|
285
|
-
let document = doc_builder
|
|
286
|
-
.map(|b| b.build())
|
|
287
|
-
.filter(|d| !d.is_empty());
|
|
285
|
+
let document = doc_builder.map(|b| b.build()).filter(|d| !d.is_empty());
|
|
288
286
|
|
|
289
287
|
Ok(PptxExtractionResult {
|
|
290
288
|
content,
|
|
@@ -85,7 +85,12 @@ impl DocumentExtractor for DocExtractor {
|
|
|
85
85
|
}
|
|
86
86
|
|
|
87
87
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
88
|
-
|
|
88
|
+
{
|
|
89
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
90
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
91
|
+
}
|
|
92
|
+
extract_doc_text(content)
|
|
93
|
+
}
|
|
89
94
|
}?;
|
|
90
95
|
|
|
91
96
|
let mut doc = InternalDocument::new("doc");
|
|
@@ -215,6 +215,9 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
215
215
|
}
|
|
216
216
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
217
217
|
{
|
|
218
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
219
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
220
|
+
}
|
|
218
221
|
crate::extraction::excel::read_excel_bytes(content, extension)?
|
|
219
222
|
}
|
|
220
223
|
};
|
|
@@ -172,7 +172,12 @@ impl DocumentExtractor for KeynoteExtractor {
|
|
|
172
172
|
}
|
|
173
173
|
|
|
174
174
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
175
|
-
|
|
175
|
+
{
|
|
176
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
177
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
178
|
+
}
|
|
179
|
+
parse_keynote(content)?
|
|
180
|
+
}
|
|
176
181
|
};
|
|
177
182
|
|
|
178
183
|
let mut doc = build_keynote_internal_document(&data);
|
|
@@ -181,7 +181,12 @@ impl DocumentExtractor for NumbersExtractor {
|
|
|
181
181
|
}
|
|
182
182
|
|
|
183
183
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
184
|
-
|
|
184
|
+
{
|
|
185
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
186
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
187
|
+
}
|
|
188
|
+
parse_numbers(content)?
|
|
189
|
+
}
|
|
185
190
|
};
|
|
186
191
|
|
|
187
192
|
let mut doc = build_numbers_internal_document(&data);
|
|
@@ -163,7 +163,12 @@ impl DocumentExtractor for PagesExtractor {
|
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
166
|
-
|
|
166
|
+
{
|
|
167
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
168
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
169
|
+
}
|
|
170
|
+
parse_pages(content)?
|
|
171
|
+
}
|
|
167
172
|
};
|
|
168
173
|
|
|
169
174
|
let mut doc = build_pages_internal_document(&data);
|
|
@@ -420,6 +420,10 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
420
420
|
content: &[u8],
|
|
421
421
|
config: &ExtractionConfig,
|
|
422
422
|
layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]>,
|
|
423
|
+
#[cfg(feature = "layout-detection")] layout_images: Option<&[image::DynamicImage]>,
|
|
424
|
+
#[cfg(not(feature = "layout-detection"))] _layout_images: Option<()>,
|
|
425
|
+
#[cfg(feature = "layout-detection")] layout_results: Option<&[crate::pdf::layout_runner::PageLayoutResult]>,
|
|
426
|
+
#[cfg(not(feature = "layout-detection"))] _layout_results: Option<()>,
|
|
423
427
|
) -> Result<PdfExtractionPhaseResult> {
|
|
424
428
|
let _span = tracing::debug_span!("extract_pdf_oxide").entered();
|
|
425
429
|
|
|
@@ -460,6 +464,11 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
460
464
|
OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
|
|
461
465
|
);
|
|
462
466
|
|
|
467
|
+
let allow_single_column = config
|
|
468
|
+
.pdf_options
|
|
469
|
+
.as_ref()
|
|
470
|
+
.is_some_and(|o| o.allow_single_column_tables);
|
|
471
|
+
|
|
463
472
|
let pre_rendered_doc =
|
|
464
473
|
if needs_structured && !config.force_ocr {
|
|
465
474
|
let k = config
|
|
@@ -503,6 +512,15 @@ pub(crate) fn extract_all_from_oxide_document(
|
|
|
503
512
|
used_structure_tree,
|
|
504
513
|
image_positions: &image_positions,
|
|
505
514
|
layout_hints,
|
|
515
|
+
allow_single_column,
|
|
516
|
+
#[cfg(feature = "layout-detection")]
|
|
517
|
+
layout_images,
|
|
518
|
+
#[cfg(feature = "layout-detection")]
|
|
519
|
+
layout_results,
|
|
520
|
+
#[cfg(feature = "layout-detection")]
|
|
521
|
+
table_model: config.layout.as_ref().map(|l| l.table_model).unwrap_or_default(),
|
|
522
|
+
#[cfg(feature = "layout-detection")]
|
|
523
|
+
acceleration: config.acceleration.as_ref(),
|
|
506
524
|
},
|
|
507
525
|
) {
|
|
508
526
|
Ok(structured_doc) if !structured_doc.elements.is_empty() => {
|
|
@@ -1192,7 +1192,14 @@ impl PdfExtractor {
|
|
|
1192
1192
|
#[cfg(feature = "layout-detection")]
|
|
1193
1193
|
let layout_bundle = run_layout_detection(content, config);
|
|
1194
1194
|
#[cfg(feature = "layout-detection")]
|
|
1195
|
-
let layout_hints = layout_bundle
|
|
1195
|
+
let (layout_hints, layout_images, layout_results) = match layout_bundle {
|
|
1196
|
+
Some(ref bundle) => (
|
|
1197
|
+
Some(bundle.hints.as_slice()),
|
|
1198
|
+
Some(bundle.images.as_slice()),
|
|
1199
|
+
Some(bundle.results.as_slice()),
|
|
1200
|
+
),
|
|
1201
|
+
None => (None, None, None),
|
|
1202
|
+
};
|
|
1196
1203
|
#[cfg(not(feature = "layout-detection"))]
|
|
1197
1204
|
let layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]> = None;
|
|
1198
1205
|
|
|
@@ -1206,7 +1213,19 @@ impl PdfExtractor {
|
|
|
1206
1213
|
pre_rendered_doc,
|
|
1207
1214
|
_has_font_encoding_issues,
|
|
1208
1215
|
pdf_annotations,
|
|
1209
|
-
) = extract_all_from_oxide_document(
|
|
1216
|
+
) = extract_all_from_oxide_document(
|
|
1217
|
+
content,
|
|
1218
|
+
config,
|
|
1219
|
+
layout_hints,
|
|
1220
|
+
#[cfg(feature = "layout-detection")]
|
|
1221
|
+
layout_images,
|
|
1222
|
+
#[cfg(not(feature = "layout-detection"))]
|
|
1223
|
+
None,
|
|
1224
|
+
#[cfg(feature = "layout-detection")]
|
|
1225
|
+
layout_results,
|
|
1226
|
+
#[cfg(not(feature = "layout-detection"))]
|
|
1227
|
+
None,
|
|
1228
|
+
)?;
|
|
1210
1229
|
|
|
1211
1230
|
// --- OCR evaluation (reuses the same logic as the pdfium path) ---
|
|
1212
1231
|
#[cfg(feature = "ocr")]
|
|
@@ -136,7 +136,12 @@ impl DocumentExtractor for PptExtractor {
|
|
|
136
136
|
}
|
|
137
137
|
|
|
138
138
|
#[cfg(not(feature = "tokio-runtime"))]
|
|
139
|
-
|
|
139
|
+
{
|
|
140
|
+
if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
|
|
141
|
+
return Err(crate::error::KreuzbergError::Cancelled);
|
|
142
|
+
}
|
|
143
|
+
crate::extraction::ppt::extract_ppt_text_with_options(content, include_master_slides)
|
|
144
|
+
}
|
|
140
145
|
}?;
|
|
141
146
|
|
|
142
147
|
let mut metadata_map = AHashMap::new();
|
|
@@ -9,8 +9,18 @@ use std::path::{Path, PathBuf};
|
|
|
9
9
|
use crate::layout::error::LayoutError;
|
|
10
10
|
use crate::model_download;
|
|
11
11
|
|
|
12
|
+
#[cfg(feature = "paddle-ocr")]
|
|
12
13
|
use crate::paddle_ocr::ModelManifestEntry;
|
|
13
14
|
|
|
15
|
+
#[cfg(not(feature = "paddle-ocr"))]
|
|
16
|
+
#[derive(Debug, Clone, serde::Serialize)]
|
|
17
|
+
pub struct ModelManifestEntry {
|
|
18
|
+
pub relative_path: String,
|
|
19
|
+
pub sha256: String,
|
|
20
|
+
pub size_bytes: u64,
|
|
21
|
+
pub source_url: String,
|
|
22
|
+
}
|
|
23
|
+
|
|
14
24
|
/// Model definition for a layout model.
|
|
15
25
|
#[derive(Debug, Clone)]
|
|
16
26
|
struct ModelDefinition {
|