kreuzberg 4.6.0 → 4.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +18 -0
- data/lib/kreuzberg/config.rb +22 -8
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +1 -0
- data/vendor/kreuzberg/src/api/openapi.rs +6 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +28 -0
- data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +23 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -0
- data/vendor/kreuzberg/src/error.rs +3 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -0
- data/vendor/kreuzberg/src/extraction/pst.rs +386 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +214 -1
- data/vendor/kreuzberg/src/extraction/transform/content.rs +40 -7
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +69 -12
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +159 -6
- data/vendor/kreuzberg/src/extractors/email.rs +3 -3
- data/vendor/kreuzberg/src/extractors/mod.rs +12 -2
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +121 -52
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +536 -194
- data/vendor/kreuzberg/src/extractors/pst.rs +264 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +14 -4
- data/vendor/kreuzberg/src/mcp/errors.rs +5 -0
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +214 -226
- data/vendor/kreuzberg/src/pdf/text.rs +41 -2
- data/vendor/kreuzberg/test_documents/jsonl/simple.jsonl +3 -0
- data/vendor/kreuzberg/test_documents/jsonl/with_blanks.jsonl +5 -0
- data/vendor/kreuzberg/tests/api_consistency.rs +7 -0
- data/vendor/kreuzberg/tests/jsonl_integration.rs +82 -0
- data/vendor/kreuzberg/tests/pst_integration.rs +82 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +24 -2
- data/vendor/kreuzberg-ffi/src/config/merge.rs +7 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +37 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +7 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7f2441d44e9083d36f9f0e60b2c155624ad251f97c6f2ca629bd3ef33aeeeb46
|
|
4
|
+
data.tar.gz: b443884a5e4bfffa1bc916928ae97997e1ca21cdfb80e76b845c454c0aa67256
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e238336b3ceae6d2bed4bd530d393f32473a7a88f68f7e28a6300ffe2f7cadc5aee0ac834fa5712223810f9cf1f7a68a363b7e96079060565e1c9e10af9f8114
|
|
7
|
+
data.tar.gz: 48eae2f77c78ef1b9794ac3dde62c785b32ab99253b6ab7ce8da9d8bba8a57f209fb5fc6353a0bc8f5771bd825fc3b3b29652149133bd0ba1dc29e58101185bb
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -875,6 +875,17 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
875
875
|
config.force_ocr = bool::try_convert(val)?;
|
|
876
876
|
}
|
|
877
877
|
|
|
878
|
+
if let Some(val) = get_kw(ruby, hash, "force_ocr_pages")
|
|
879
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
880
|
+
{
|
|
881
|
+
let pages_array = magnus::RArray::try_convert(val)?;
|
|
882
|
+
let pages: Vec<usize> = pages_array
|
|
883
|
+
.into_iter()
|
|
884
|
+
.map(|v| usize::try_convert(v))
|
|
885
|
+
.collect::<Result<Vec<_>, _>>()?;
|
|
886
|
+
config.force_ocr_pages = Some(pages);
|
|
887
|
+
}
|
|
888
|
+
|
|
878
889
|
if let Some(val) = get_kw(ruby, hash, "include_document_structure") {
|
|
879
890
|
config.include_document_structure = bool::try_convert(val)?;
|
|
880
891
|
}
|
|
@@ -1013,6 +1024,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
|
|
|
1013
1024
|
}
|
|
1014
1025
|
};
|
|
1015
1026
|
}
|
|
1027
|
+
|
|
1028
|
+
if let Some(val) = get_kw(ruby, hash, "extraction_timeout_secs")
|
|
1029
|
+
&& val.equal(ruby.qnil()).ok() != Some(true)
|
|
1030
|
+
{
|
|
1031
|
+
let secs = u64::try_convert(val)?;
|
|
1032
|
+
config.extraction_timeout_secs = Some(secs);
|
|
1033
|
+
}
|
|
1016
1034
|
}
|
|
1017
1035
|
|
|
1018
1036
|
Ok(config)
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -926,14 +926,14 @@ module Kreuzberg
|
|
|
926
926
|
# )
|
|
927
927
|
#
|
|
928
928
|
class Extraction
|
|
929
|
-
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
929
|
+
attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
|
|
930
930
|
:include_document_structure,
|
|
931
931
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
932
932
|
:images, :postprocessor,
|
|
933
933
|
:token_reduction, :keywords, :html_options, :pages,
|
|
934
934
|
:max_concurrent_extractions, :output_format, :result_format,
|
|
935
935
|
:security_limits, :layout, :concurrency,
|
|
936
|
-
:cache_namespace, :cache_ttl_secs
|
|
936
|
+
:cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
|
|
937
937
|
|
|
938
938
|
# Alias for backward compatibility - image_extraction is the canonical name
|
|
939
939
|
alias image_extraction images
|
|
@@ -954,11 +954,11 @@ module Kreuzberg
|
|
|
954
954
|
#
|
|
955
955
|
# Keys that are allowed in the Extraction config
|
|
956
956
|
ALLOWED_KEYS = %i[
|
|
957
|
-
use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
|
|
957
|
+
use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
|
|
958
958
|
language_detection pdf_options image_extraction
|
|
959
959
|
postprocessor token_reduction keywords html_options pages
|
|
960
960
|
max_concurrent_extractions output_format result_format
|
|
961
|
-
security_limits layout concurrency cache_namespace cache_ttl_secs
|
|
961
|
+
security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
|
|
962
962
|
].freeze
|
|
963
963
|
|
|
964
964
|
# Aliases for backward compatibility
|
|
@@ -1019,6 +1019,7 @@ module Kreuzberg
|
|
|
1019
1019
|
use_cache: true,
|
|
1020
1020
|
enable_quality_processing: true,
|
|
1021
1021
|
force_ocr: false,
|
|
1022
|
+
force_ocr_pages: nil,
|
|
1022
1023
|
include_document_structure: false,
|
|
1023
1024
|
ocr: nil,
|
|
1024
1025
|
chunking: nil,
|
|
@@ -1037,10 +1038,12 @@ module Kreuzberg
|
|
|
1037
1038
|
layout: nil,
|
|
1038
1039
|
concurrency: nil,
|
|
1039
1040
|
cache_namespace: nil,
|
|
1040
|
-
cache_ttl_secs: nil
|
|
1041
|
+
cache_ttl_secs: nil,
|
|
1042
|
+
extraction_timeout_secs: nil)
|
|
1041
1043
|
kwargs = {
|
|
1042
1044
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
1043
|
-
force_ocr: force_ocr,
|
|
1045
|
+
force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
|
|
1046
|
+
include_document_structure: include_document_structure,
|
|
1044
1047
|
ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
1045
1048
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
1046
1049
|
postprocessor: postprocessor,
|
|
@@ -1050,7 +1053,8 @@ module Kreuzberg
|
|
|
1050
1053
|
security_limits: security_limits, layout: layout,
|
|
1051
1054
|
concurrency: concurrency,
|
|
1052
1055
|
cache_namespace: cache_namespace,
|
|
1053
|
-
cache_ttl_secs: cache_ttl_secs
|
|
1056
|
+
cache_ttl_secs: cache_ttl_secs,
|
|
1057
|
+
extraction_timeout_secs: extraction_timeout_secs
|
|
1054
1058
|
}
|
|
1055
1059
|
extracted = extract_from_hash(hash, kwargs)
|
|
1056
1060
|
|
|
@@ -1068,6 +1072,7 @@ module Kreuzberg
|
|
|
1068
1072
|
@use_cache = params[:use_cache] ? true : false
|
|
1069
1073
|
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
1070
1074
|
@force_ocr = params[:force_ocr] ? true : false
|
|
1075
|
+
@force_ocr_pages = params[:force_ocr_pages]
|
|
1071
1076
|
@include_document_structure = params[:include_document_structure] ? true : false
|
|
1072
1077
|
@ocr = normalize_config(params[:ocr], OCR)
|
|
1073
1078
|
@chunking = normalize_config(params[:chunking], Chunking)
|
|
@@ -1086,6 +1091,7 @@ module Kreuzberg
|
|
|
1086
1091
|
@result_format = validate_result_format(params[:result_format])
|
|
1087
1092
|
@cache_namespace = params[:cache_namespace]
|
|
1088
1093
|
@cache_ttl_secs = params[:cache_ttl_secs]&.to_i
|
|
1094
|
+
@extraction_timeout_secs = params[:extraction_timeout_secs]&.to_i
|
|
1089
1095
|
@security_limits = params[:security_limits]
|
|
1090
1096
|
end
|
|
1091
1097
|
|
|
@@ -1118,12 +1124,14 @@ module Kreuzberg
|
|
|
1118
1124
|
use_cache: @use_cache,
|
|
1119
1125
|
enable_quality_processing: @enable_quality_processing,
|
|
1120
1126
|
force_ocr: @force_ocr,
|
|
1127
|
+
force_ocr_pages: @force_ocr_pages,
|
|
1121
1128
|
include_document_structure: @include_document_structure,
|
|
1122
1129
|
max_concurrent_extractions: @max_concurrent_extractions,
|
|
1123
1130
|
output_format: @output_format,
|
|
1124
1131
|
result_format: @result_format,
|
|
1125
1132
|
cache_namespace: @cache_namespace,
|
|
1126
|
-
cache_ttl_secs: @cache_ttl_secs
|
|
1133
|
+
cache_ttl_secs: @cache_ttl_secs,
|
|
1134
|
+
extraction_timeout_secs: @extraction_timeout_secs
|
|
1127
1135
|
}
|
|
1128
1136
|
end
|
|
1129
1137
|
|
|
@@ -1250,6 +1258,8 @@ module Kreuzberg
|
|
|
1250
1258
|
@enable_quality_processing = value ? true : false
|
|
1251
1259
|
when :force_ocr
|
|
1252
1260
|
@force_ocr = value ? true : false
|
|
1261
|
+
when :force_ocr_pages
|
|
1262
|
+
@force_ocr_pages = value
|
|
1253
1263
|
when :include_document_structure
|
|
1254
1264
|
@include_document_structure = value ? true : false
|
|
1255
1265
|
when :ocr
|
|
@@ -1286,6 +1296,8 @@ module Kreuzberg
|
|
|
1286
1296
|
@cache_namespace = value
|
|
1287
1297
|
when :cache_ttl_secs
|
|
1288
1298
|
@cache_ttl_secs = value&.to_i
|
|
1299
|
+
when :extraction_timeout_secs
|
|
1300
|
+
@extraction_timeout_secs = value&.to_i
|
|
1289
1301
|
else
|
|
1290
1302
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1291
1303
|
end
|
|
@@ -1345,6 +1357,7 @@ module Kreuzberg
|
|
|
1345
1357
|
@use_cache = merged.use_cache
|
|
1346
1358
|
@enable_quality_processing = merged.enable_quality_processing
|
|
1347
1359
|
@force_ocr = merged.force_ocr
|
|
1360
|
+
@force_ocr_pages = merged.force_ocr_pages
|
|
1348
1361
|
@include_document_structure = merged.include_document_structure
|
|
1349
1362
|
@ocr = merged.ocr
|
|
1350
1363
|
@chunking = merged.chunking
|
|
@@ -1369,6 +1382,7 @@ module Kreuzberg
|
|
|
1369
1382
|
@result_format = merged.result_format
|
|
1370
1383
|
@cache_namespace = merged.cache_namespace
|
|
1371
1384
|
@cache_ttl_secs = merged.cache_ttl_secs
|
|
1385
|
+
@extraction_timeout_secs = merged.extraction_timeout_secs
|
|
1372
1386
|
end
|
|
1373
1387
|
end
|
|
1374
1388
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -481,7 +481,9 @@ module Kreuzberg
|
|
|
481
481
|
attr_reader enable_quality_processing: bool
|
|
482
482
|
attr_reader cache_namespace: String?
|
|
483
483
|
attr_reader cache_ttl_secs: Integer?
|
|
484
|
+
attr_reader extraction_timeout_secs: Integer?
|
|
484
485
|
attr_reader force_ocr: bool
|
|
486
|
+
attr_reader force_ocr_pages: Array[Integer]?
|
|
485
487
|
attr_reader include_document_structure: bool
|
|
486
488
|
attr_reader ocr: OCR?
|
|
487
489
|
attr_reader chunking: Chunking?
|
|
@@ -508,6 +510,7 @@ module Kreuzberg
|
|
|
508
510
|
?use_cache: bool,
|
|
509
511
|
?enable_quality_processing: bool,
|
|
510
512
|
?force_ocr: bool,
|
|
513
|
+
?force_ocr_pages: Array[Integer]?,
|
|
511
514
|
?include_document_structure: bool,
|
|
512
515
|
?ocr: (OCR | Hash[Symbol, untyped])?,
|
|
513
516
|
?chunking: (Chunking | Hash[Symbol, untyped])?,
|
|
@@ -525,7 +528,8 @@ module Kreuzberg
|
|
|
525
528
|
?output_format: String?,
|
|
526
529
|
?result_format: String?,
|
|
527
530
|
?cache_namespace: String?,
|
|
528
|
-
?cache_ttl_secs: Integer
|
|
531
|
+
?cache_ttl_secs: Integer?,
|
|
532
|
+
?extraction_timeout_secs: Integer?
|
|
529
533
|
) -> void
|
|
530
534
|
def to_h: () -> Hash[Symbol, untyped]
|
|
531
535
|
def to_json: (*untyped) -> String
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.6.0"
|
|
5
|
+
version = "4.6.1"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
|
|
|
30
30
|
image = { version = "0.25.10", default-features = false }
|
|
31
31
|
itertools = "0.14"
|
|
32
32
|
js-sys = "0.3"
|
|
33
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.0", default-features = false }
|
|
34
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.0" }
|
|
33
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.6.1", default-features = false }
|
|
34
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.1" }
|
|
35
35
|
lazy_static = "1.5.0"
|
|
36
36
|
libc = "0.2.183"
|
|
37
37
|
log = "0.4"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.6.0"
|
|
3
|
+
version = "4.6.1"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -53,7 +53,7 @@ office = [
|
|
|
53
53
|
]
|
|
54
54
|
hwp = ["dep:cfb", "dep:flate2"]
|
|
55
55
|
iwork = ["dep:zip", "dep:snap"]
|
|
56
|
-
email = ["dep:mail-parser", "dep:cfb"]
|
|
56
|
+
email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
|
|
57
57
|
html = ["dep:html-to-markdown-rs"]
|
|
58
58
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
59
59
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
|
|
@@ -238,6 +238,7 @@ bytes = { version = "1", features = ["serde"] }
|
|
|
238
238
|
calamine = { version = "0.34.0", features = ["dates"], optional = true }
|
|
239
239
|
cfb = { version = "0.14", optional = true }
|
|
240
240
|
chardetng = { version = "0.1.17", optional = true }
|
|
241
|
+
chrono = { version = "0.4", optional = true }
|
|
241
242
|
dashmap = "6.1"
|
|
242
243
|
dbase = { version = "0.7", optional = true }
|
|
243
244
|
encoding_rs = { version = "0.8.35" }
|
|
@@ -287,6 +288,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
287
288
|
"ndarray",
|
|
288
289
|
"api-18",
|
|
289
290
|
], optional = true }
|
|
291
|
+
outlook-pst = { version = "1.2.0", optional = true }
|
|
290
292
|
parking_lot = "0.12.5"
|
|
291
293
|
pastey = "0.2"
|
|
292
294
|
pdf_oxide = { version = "0.3.17", default-features = false, optional = true }
|
|
@@ -315,6 +317,7 @@ sha2 = { version = "0.10", optional = true }
|
|
|
315
317
|
simdutf8 = { version = "0.1", optional = true }
|
|
316
318
|
snap = { version = "1.1", optional = true }
|
|
317
319
|
tar = { version = "0.4.45", optional = true }
|
|
320
|
+
tempfile = { version = "3.27.0", optional = true }
|
|
318
321
|
text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
|
|
319
322
|
thiserror = "2.0.18"
|
|
320
323
|
tiff = { version = "0.11", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.6.0 Release**
|
|
21
|
+
> **🚀 Version 4.6.1 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -115,6 +115,7 @@ impl ApiError {
|
|
|
115
115
|
KreuzbergError::Plugin { .. } => "PluginError",
|
|
116
116
|
KreuzbergError::LockPoisoned(_) => "LockPoisonedError",
|
|
117
117
|
KreuzbergError::UnsupportedFormat(_) => "UnsupportedFormatError",
|
|
118
|
+
KreuzbergError::Timeout { .. } => "TimeoutError",
|
|
118
119
|
KreuzbergError::Other(_) => "Error",
|
|
119
120
|
};
|
|
120
121
|
|
|
@@ -74,6 +74,12 @@ use utoipa::OpenApi;
|
|
|
74
74
|
crate::types::extraction::ElementId,
|
|
75
75
|
crate::types::extraction::ElementType,
|
|
76
76
|
crate::types::extraction::BoundingBox,
|
|
77
|
+
crate::types::ocr_elements::OcrElement,
|
|
78
|
+
crate::types::ocr_elements::OcrBoundingGeometry,
|
|
79
|
+
crate::types::ocr_elements::OcrConfidence,
|
|
80
|
+
crate::types::ocr_elements::OcrRotation,
|
|
81
|
+
crate::types::ocr_elements::OcrElementLevel,
|
|
82
|
+
crate::types::ocr_elements::OcrElementConfig,
|
|
77
83
|
crate::types::metadata::Metadata,
|
|
78
84
|
crate::types::tables::Table,
|
|
79
85
|
crate::types::page::PageContent,
|
|
@@ -47,6 +47,16 @@ pub struct ExtractionConfig {
|
|
|
47
47
|
#[serde(default)]
|
|
48
48
|
pub force_ocr: bool,
|
|
49
49
|
|
|
50
|
+
/// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
|
|
51
|
+
///
|
|
52
|
+
/// When set, only the listed pages are OCR'd regardless of text layer quality.
|
|
53
|
+
/// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
|
|
54
|
+
/// Only applies to PDF documents. Duplicates are automatically deduplicated.
|
|
55
|
+
/// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
|
|
56
|
+
#[serde(default)]
|
|
57
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
58
|
+
pub force_ocr_pages: Option<Vec<usize>>,
|
|
59
|
+
|
|
50
60
|
/// Text chunking configuration (None = chunking disabled)
|
|
51
61
|
#[serde(default)]
|
|
52
62
|
pub chunking: Option<ChunkingConfig>,
|
|
@@ -89,6 +99,14 @@ pub struct ExtractionConfig {
|
|
|
89
99
|
#[serde(default)]
|
|
90
100
|
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
|
|
91
101
|
|
|
102
|
+
/// Default per-file timeout in seconds for batch extraction.
|
|
103
|
+
///
|
|
104
|
+
/// When set, each file in a batch will be canceled after this duration
|
|
105
|
+
/// unless overridden by [`FileExtractionConfig::timeout_secs`].
|
|
106
|
+
/// `None` means no timeout (unbounded extraction time).
|
|
107
|
+
#[serde(default)]
|
|
108
|
+
pub extraction_timeout_secs: Option<u64>,
|
|
109
|
+
|
|
92
110
|
/// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
|
|
93
111
|
///
|
|
94
112
|
/// Limits parallelism to prevent resource exhaustion when processing
|
|
@@ -201,6 +219,7 @@ impl Default for ExtractionConfig {
|
|
|
201
219
|
enable_quality_processing: true,
|
|
202
220
|
ocr: None,
|
|
203
221
|
force_ocr: false,
|
|
222
|
+
force_ocr_pages: None,
|
|
204
223
|
chunking: None,
|
|
205
224
|
images: None,
|
|
206
225
|
#[cfg(feature = "pdf")]
|
|
@@ -213,6 +232,7 @@ impl Default for ExtractionConfig {
|
|
|
213
232
|
postprocessor: None,
|
|
214
233
|
#[cfg(feature = "html")]
|
|
215
234
|
html_options: None,
|
|
235
|
+
extraction_timeout_secs: None,
|
|
216
236
|
max_concurrent_extractions: None,
|
|
217
237
|
#[cfg(feature = "archives")]
|
|
218
238
|
security_limits: None,
|
|
@@ -259,6 +279,7 @@ impl ExtractionConfig {
|
|
|
259
279
|
ref enable_quality_processing,
|
|
260
280
|
ref ocr,
|
|
261
281
|
ref force_ocr,
|
|
282
|
+
ref force_ocr_pages,
|
|
262
283
|
ref chunking,
|
|
263
284
|
ref images,
|
|
264
285
|
#[cfg(feature = "pdf")]
|
|
@@ -276,6 +297,7 @@ impl ExtractionConfig {
|
|
|
276
297
|
ref include_document_structure,
|
|
277
298
|
#[cfg(feature = "layout-detection")]
|
|
278
299
|
ref layout,
|
|
300
|
+
ref timeout_secs,
|
|
279
301
|
} = *overrides;
|
|
280
302
|
|
|
281
303
|
let mut config = self.clone();
|
|
@@ -289,6 +311,9 @@ impl ExtractionConfig {
|
|
|
289
311
|
if let Some(v) = force_ocr {
|
|
290
312
|
config.force_ocr = *v;
|
|
291
313
|
}
|
|
314
|
+
if let Some(v) = force_ocr_pages {
|
|
315
|
+
config.force_ocr_pages = Some(v.clone());
|
|
316
|
+
}
|
|
292
317
|
if let Some(v) = chunking {
|
|
293
318
|
config.chunking = Some(v.clone());
|
|
294
319
|
}
|
|
@@ -332,6 +357,9 @@ impl ExtractionConfig {
|
|
|
332
357
|
if let Some(v) = layout {
|
|
333
358
|
config.layout = Some(v.clone());
|
|
334
359
|
}
|
|
360
|
+
if let Some(v) = timeout_secs {
|
|
361
|
+
config.extraction_timeout_secs = Some(*v);
|
|
362
|
+
}
|
|
335
363
|
|
|
336
364
|
config
|
|
337
365
|
}
|
|
@@ -57,6 +57,10 @@ pub struct FileExtractionConfig {
|
|
|
57
57
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
58
58
|
pub force_ocr: Option<bool>,
|
|
59
59
|
|
|
60
|
+
/// Override force OCR pages for this file (1-indexed page numbers).
|
|
61
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
62
|
+
pub force_ocr_pages: Option<Vec<usize>>,
|
|
63
|
+
|
|
60
64
|
/// Override chunking configuration for this file.
|
|
61
65
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
62
66
|
pub chunking: Option<ChunkingConfig>,
|
|
@@ -112,4 +116,12 @@ pub struct FileExtractionConfig {
|
|
|
112
116
|
#[cfg(feature = "layout-detection")]
|
|
113
117
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
114
118
|
pub layout: Option<super::super::layout::LayoutDetectionConfig>,
|
|
119
|
+
|
|
120
|
+
/// Override per-file extraction timeout in seconds.
|
|
121
|
+
///
|
|
122
|
+
/// When set, the extraction for this file will be canceled after the
|
|
123
|
+
/// specified duration. A timed-out file produces an error result without
|
|
124
|
+
/// affecting other files in the batch.
|
|
125
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
126
|
+
pub timeout_secs: Option<u64>,
|
|
115
127
|
}
|
|
@@ -63,11 +63,12 @@ where
|
|
|
63
63
|
Ok(results.into_iter().map(|r| r.unwrap()).collect())
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
/// Run a single extraction task with semaphore gating, timing, and batch mode.
|
|
66
|
+
/// Run a single extraction task with semaphore gating, timing, optional timeout, and batch mode.
|
|
67
67
|
#[cfg(feature = "tokio-runtime")]
|
|
68
68
|
async fn run_timed_extraction<F, Fut>(
|
|
69
69
|
index: usize,
|
|
70
70
|
semaphore: Arc<tokio::sync::Semaphore>,
|
|
71
|
+
timeout_secs: Option<u64>,
|
|
71
72
|
extract_fn: F,
|
|
72
73
|
) -> (usize, Result<ExtractionResult>, u64)
|
|
73
74
|
where
|
|
@@ -76,7 +77,23 @@ where
|
|
|
76
77
|
{
|
|
77
78
|
let _permit = semaphore.acquire().await.unwrap();
|
|
78
79
|
let start = Instant::now();
|
|
79
|
-
|
|
80
|
+
|
|
81
|
+
let extraction_future = crate::core::batch_mode::with_batch_mode(extract_fn());
|
|
82
|
+
|
|
83
|
+
let mut result = match timeout_secs {
|
|
84
|
+
Some(secs) => match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
|
|
85
|
+
Ok(inner) => inner,
|
|
86
|
+
Err(_elapsed) => {
|
|
87
|
+
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
88
|
+
Err(KreuzbergError::Timeout {
|
|
89
|
+
elapsed_ms,
|
|
90
|
+
limit_ms: secs * 1000,
|
|
91
|
+
})
|
|
92
|
+
}
|
|
93
|
+
},
|
|
94
|
+
None => extraction_future.await,
|
|
95
|
+
};
|
|
96
|
+
|
|
80
97
|
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
81
98
|
|
|
82
99
|
if let Ok(ref mut r) = result {
|
|
@@ -182,7 +199,8 @@ pub async fn batch_extract_file(
|
|
|
182
199
|
async move {
|
|
183
200
|
let (ref path, ref file_config) = items[index];
|
|
184
201
|
let resolved = resolve_config(&cfg, file_config);
|
|
185
|
-
|
|
202
|
+
let timeout = resolved.extraction_timeout_secs;
|
|
203
|
+
run_timed_extraction(index, sem, timeout, || {
|
|
186
204
|
let path = path.clone();
|
|
187
205
|
async move { extract_file(&path, None, &resolved).await }
|
|
188
206
|
})
|
|
@@ -282,7 +300,8 @@ pub async fn batch_extract_bytes(
|
|
|
282
300
|
async move {
|
|
283
301
|
let (bytes, mime_type, file_config) = slots[index].lock().take().expect("batch item already consumed");
|
|
284
302
|
let resolved = resolve_config(&cfg, &file_config);
|
|
285
|
-
|
|
303
|
+
let timeout = resolved.extraction_timeout_secs;
|
|
304
|
+
run_timed_extraction(index, sem, timeout, || async move {
|
|
286
305
|
extract_bytes(&bytes, &mime_type, &resolved).await
|
|
287
306
|
})
|
|
288
307
|
.await
|
|
@@ -35,7 +35,9 @@ pub const LEGACY_POWERPOINT_MIME_TYPE: &str = "application/vnd.ms-powerpoint";
|
|
|
35
35
|
|
|
36
36
|
pub const EML_MIME_TYPE: &str = "message/rfc822";
|
|
37
37
|
pub const MSG_MIME_TYPE: &str = "application/vnd.ms-outlook";
|
|
38
|
+
pub const PST_MIME_TYPE: &str = "application/vnd.ms-outlook-pst";
|
|
38
39
|
pub const JSON_MIME_TYPE: &str = "application/json";
|
|
40
|
+
pub const JSONL_MIME_TYPE: &str = "application/x-ndjson";
|
|
39
41
|
pub const YAML_MIME_TYPE: &str = "application/x-yaml";
|
|
40
42
|
pub const TOML_MIME_TYPE: &str = "application/toml";
|
|
41
43
|
pub const XML_MIME_TYPE: &str = "application/xml";
|
|
@@ -368,6 +370,11 @@ static FORMATS: &[FormatEntry] = &[
|
|
|
368
370
|
mime_type: "application/csl+json",
|
|
369
371
|
aliases: &[],
|
|
370
372
|
},
|
|
373
|
+
FormatEntry {
|
|
374
|
+
extensions: &["jsonl", "ndjson"],
|
|
375
|
+
mime_type: "application/x-ndjson",
|
|
376
|
+
aliases: &["application/jsonl", "application/x-jsonlines"],
|
|
377
|
+
},
|
|
371
378
|
FormatEntry {
|
|
372
379
|
extensions: &["yaml", "yml"],
|
|
373
380
|
mime_type: "application/x-yaml",
|
|
@@ -399,6 +406,11 @@ static FORMATS: &[FormatEntry] = &[
|
|
|
399
406
|
mime_type: "application/vnd.ms-outlook",
|
|
400
407
|
aliases: &[],
|
|
401
408
|
},
|
|
409
|
+
FormatEntry {
|
|
410
|
+
extensions: &["pst"],
|
|
411
|
+
mime_type: "application/vnd.ms-outlook-pst",
|
|
412
|
+
aliases: &[],
|
|
413
|
+
},
|
|
402
414
|
// ── Archives ────────────────────────────────────────────────────────
|
|
403
415
|
FormatEntry {
|
|
404
416
|
extensions: &["zip"],
|
|
@@ -124,6 +124,9 @@ pub enum KreuzbergError {
|
|
|
124
124
|
#[error("Unsupported format: {0}")]
|
|
125
125
|
UnsupportedFormat(String),
|
|
126
126
|
|
|
127
|
+
#[error("Extraction timed out after {elapsed_ms}ms (limit: {limit_ms}ms)")]
|
|
128
|
+
Timeout { elapsed_ms: u64, limit_ms: u64 },
|
|
129
|
+
|
|
127
130
|
#[error("{0}")]
|
|
128
131
|
Other(String),
|
|
129
132
|
}
|
|
@@ -22,6 +22,9 @@ pub mod archive;
|
|
|
22
22
|
#[cfg(feature = "email")]
|
|
23
23
|
pub mod email;
|
|
24
24
|
|
|
25
|
+
#[cfg(feature = "email")]
|
|
26
|
+
pub mod pst;
|
|
27
|
+
|
|
25
28
|
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
|
|
26
29
|
pub mod excel;
|
|
27
30
|
|
|
@@ -77,6 +80,9 @@ pub use archive::{
|
|
|
77
80
|
#[cfg(feature = "email")]
|
|
78
81
|
pub use email::{build_email_text_output, extract_email_content, parse_eml_content, parse_msg_content};
|
|
79
82
|
|
|
83
|
+
#[cfg(feature = "email")]
|
|
84
|
+
pub use pst::extract_pst_messages;
|
|
85
|
+
|
|
80
86
|
#[cfg(any(feature = "excel", feature = "excel-wasm"))]
|
|
81
87
|
pub use excel::{excel_to_markdown, read_excel_bytes, read_excel_file};
|
|
82
88
|
|