kreuzberg 4.6.0 → 4.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  4. data/ext/kreuzberg_rb/native/src/config/types.rs +18 -0
  5. data/lib/kreuzberg/config.rb +22 -8
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/sig/kreuzberg.rbs +5 -1
  8. data/vendor/Cargo.toml +3 -3
  9. data/vendor/kreuzberg/Cargo.toml +5 -2
  10. data/vendor/kreuzberg/README.md +1 -1
  11. data/vendor/kreuzberg/src/api/error.rs +1 -0
  12. data/vendor/kreuzberg/src/api/openapi.rs +6 -0
  13. data/vendor/kreuzberg/src/core/config/extraction/core.rs +28 -0
  14. data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +12 -0
  15. data/vendor/kreuzberg/src/core/extractor/batch.rs +23 -4
  16. data/vendor/kreuzberg/src/core/mime.rs +12 -0
  17. data/vendor/kreuzberg/src/error.rs +3 -0
  18. data/vendor/kreuzberg/src/extraction/mod.rs +6 -0
  19. data/vendor/kreuzberg/src/extraction/pst.rs +386 -0
  20. data/vendor/kreuzberg/src/extraction/structured.rs +214 -1
  21. data/vendor/kreuzberg/src/extraction/transform/content.rs +40 -7
  22. data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +69 -12
  23. data/vendor/kreuzberg/src/extraction/transform/mod.rs +159 -6
  24. data/vendor/kreuzberg/src/extractors/email.rs +3 -3
  25. data/vendor/kreuzberg/src/extractors/mod.rs +12 -2
  26. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +121 -52
  27. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +536 -194
  28. data/vendor/kreuzberg/src/extractors/pst.rs +264 -0
  29. data/vendor/kreuzberg/src/extractors/structured.rs +14 -4
  30. data/vendor/kreuzberg/src/mcp/errors.rs +5 -0
  31. data/vendor/kreuzberg/src/pdf/layout_runner.rs +214 -226
  32. data/vendor/kreuzberg/src/pdf/text.rs +41 -2
  33. data/vendor/kreuzberg/test_documents/jsonl/simple.jsonl +3 -0
  34. data/vendor/kreuzberg/test_documents/jsonl/with_blanks.jsonl +5 -0
  35. data/vendor/kreuzberg/tests/api_consistency.rs +7 -0
  36. data/vendor/kreuzberg/tests/jsonl_integration.rs +82 -0
  37. data/vendor/kreuzberg/tests/pst_integration.rs +82 -0
  38. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  39. data/vendor/kreuzberg-ffi/kreuzberg.h +24 -2
  40. data/vendor/kreuzberg-ffi/src/config/merge.rs +7 -0
  41. data/vendor/kreuzberg-ffi/src/config_builder.rs +37 -0
  42. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  43. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  44. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  45. metadata +7 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 203e9719bcf3cf2cda1252dcd7a5c5782e7b73936a304b626a351894d4fcd909
4
- data.tar.gz: 2c02a45c882ef6b6b6935896e9334c46f012aaf8bc6f6669fa3c0110b67398e5
3
+ metadata.gz: 7f2441d44e9083d36f9f0e60b2c155624ad251f97c6f2ca629bd3ef33aeeeb46
4
+ data.tar.gz: b443884a5e4bfffa1bc916928ae97997e1ca21cdfb80e76b845c454c0aa67256
5
5
  SHA512:
6
- metadata.gz: d3dde81c8c38b1ee99bed3cae32e477e4c8941d401c6449fc9c3eec3608a5b771b47c20ab3a9679ccf75059fed5e6c09f9d91eefed83a7d9dc59eebf7acb5626
7
- data.tar.gz: e590247800d9752175985ee3b8ad0c89c5926f1afa0669a881cf476455c8514332880d30ff35d46a2836cf9cdc18b752296fb06a545f42129b548b5675180a71
6
+ metadata.gz: e238336b3ceae6d2bed4bd530d393f32473a7a88f68f7e28a6300ffe2f7cadc5aee0ac834fa5712223810f9cf1f7a68a363b7e96079060565e1c9e10af9f8114
7
+ data.tar.gz: 48eae2f77c78ef1b9794ac3dde62c785b32ab99253b6ab7ce8da9d8bba8a57f209fb5fc6353a0bc8f5771bd825fc3b3b29652149133bd0ba1dc29e58101185bb
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.6.0"
3
+ version = "4.6.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED
@@ -875,6 +875,17 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
875
875
  config.force_ocr = bool::try_convert(val)?;
876
876
  }
877
877
 
878
+ if let Some(val) = get_kw(ruby, hash, "force_ocr_pages")
879
+ && val.equal(ruby.qnil()).ok() != Some(true)
880
+ {
881
+ let pages_array = magnus::RArray::try_convert(val)?;
882
+ let pages: Vec<usize> = pages_array
883
+ .into_iter()
884
+ .map(|v| usize::try_convert(v))
885
+ .collect::<Result<Vec<_>, _>>()?;
886
+ config.force_ocr_pages = Some(pages);
887
+ }
888
+
878
889
  if let Some(val) = get_kw(ruby, hash, "include_document_structure") {
879
890
  config.include_document_structure = bool::try_convert(val)?;
880
891
  }
@@ -1013,6 +1024,13 @@ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extra
1013
1024
  }
1014
1025
  };
1015
1026
  }
1027
+
1028
+ if let Some(val) = get_kw(ruby, hash, "extraction_timeout_secs")
1029
+ && val.equal(ruby.qnil()).ok() != Some(true)
1030
+ {
1031
+ let secs = u64::try_convert(val)?;
1032
+ config.extraction_timeout_secs = Some(secs);
1033
+ }
1016
1034
  }
1017
1035
 
1018
1036
  Ok(config)
data/lib/kreuzberg/config.rb CHANGED
@@ -926,14 +926,14 @@ module Kreuzberg
926
926
  # )
927
927
  #
928
928
  class Extraction
929
- attr_reader :use_cache, :enable_quality_processing, :force_ocr,
929
+ attr_reader :use_cache, :enable_quality_processing, :force_ocr, :force_ocr_pages,
930
930
  :include_document_structure,
931
931
  :ocr, :chunking, :language_detection, :pdf_options,
932
932
  :images, :postprocessor,
933
933
  :token_reduction, :keywords, :html_options, :pages,
934
934
  :max_concurrent_extractions, :output_format, :result_format,
935
935
  :security_limits, :layout, :concurrency,
936
- :cache_namespace, :cache_ttl_secs
936
+ :cache_namespace, :cache_ttl_secs, :extraction_timeout_secs
937
937
 
938
938
  # Alias for backward compatibility - image_extraction is the canonical name
939
939
  alias image_extraction images
@@ -954,11 +954,11 @@ module Kreuzberg
954
954
  #
955
955
  # Keys that are allowed in the Extraction config
956
956
  ALLOWED_KEYS = %i[
957
- use_cache enable_quality_processing force_ocr include_document_structure ocr chunking
957
+ use_cache enable_quality_processing force_ocr force_ocr_pages include_document_structure ocr chunking
958
958
  language_detection pdf_options image_extraction
959
959
  postprocessor token_reduction keywords html_options pages
960
960
  max_concurrent_extractions output_format result_format
961
- security_limits layout concurrency cache_namespace cache_ttl_secs
961
+ security_limits layout concurrency cache_namespace cache_ttl_secs extraction_timeout_secs
962
962
  ].freeze
963
963
 
964
964
  # Aliases for backward compatibility
@@ -1019,6 +1019,7 @@ module Kreuzberg
1019
1019
  use_cache: true,
1020
1020
  enable_quality_processing: true,
1021
1021
  force_ocr: false,
1022
+ force_ocr_pages: nil,
1022
1023
  include_document_structure: false,
1023
1024
  ocr: nil,
1024
1025
  chunking: nil,
@@ -1037,10 +1038,12 @@ module Kreuzberg
1037
1038
  layout: nil,
1038
1039
  concurrency: nil,
1039
1040
  cache_namespace: nil,
1040
- cache_ttl_secs: nil)
1041
+ cache_ttl_secs: nil,
1042
+ extraction_timeout_secs: nil)
1041
1043
  kwargs = {
1042
1044
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
1043
- force_ocr: force_ocr, include_document_structure: include_document_structure,
1045
+ force_ocr: force_ocr, force_ocr_pages: force_ocr_pages,
1046
+ include_document_structure: include_document_structure,
1044
1047
  ocr: ocr, chunking: chunking, language_detection: language_detection,
1045
1048
  pdf_options: pdf_options, image_extraction: image_extraction,
1046
1049
  postprocessor: postprocessor,
@@ -1050,7 +1053,8 @@ module Kreuzberg
1050
1053
  security_limits: security_limits, layout: layout,
1051
1054
  concurrency: concurrency,
1052
1055
  cache_namespace: cache_namespace,
1053
- cache_ttl_secs: cache_ttl_secs
1056
+ cache_ttl_secs: cache_ttl_secs,
1057
+ extraction_timeout_secs: extraction_timeout_secs
1054
1058
  }
1055
1059
  extracted = extract_from_hash(hash, kwargs)
1056
1060
 
@@ -1068,6 +1072,7 @@ module Kreuzberg
1068
1072
  @use_cache = params[:use_cache] ? true : false
1069
1073
  @enable_quality_processing = params[:enable_quality_processing] ? true : false
1070
1074
  @force_ocr = params[:force_ocr] ? true : false
1075
+ @force_ocr_pages = params[:force_ocr_pages]
1071
1076
  @include_document_structure = params[:include_document_structure] ? true : false
1072
1077
  @ocr = normalize_config(params[:ocr], OCR)
1073
1078
  @chunking = normalize_config(params[:chunking], Chunking)
@@ -1086,6 +1091,7 @@ module Kreuzberg
1086
1091
  @result_format = validate_result_format(params[:result_format])
1087
1092
  @cache_namespace = params[:cache_namespace]
1088
1093
  @cache_ttl_secs = params[:cache_ttl_secs]&.to_i
1094
+ @extraction_timeout_secs = params[:extraction_timeout_secs]&.to_i
1089
1095
  @security_limits = params[:security_limits]
1090
1096
  end
1091
1097
 
@@ -1118,12 +1124,14 @@ module Kreuzberg
1118
1124
  use_cache: @use_cache,
1119
1125
  enable_quality_processing: @enable_quality_processing,
1120
1126
  force_ocr: @force_ocr,
1127
+ force_ocr_pages: @force_ocr_pages,
1121
1128
  include_document_structure: @include_document_structure,
1122
1129
  max_concurrent_extractions: @max_concurrent_extractions,
1123
1130
  output_format: @output_format,
1124
1131
  result_format: @result_format,
1125
1132
  cache_namespace: @cache_namespace,
1126
- cache_ttl_secs: @cache_ttl_secs
1133
+ cache_ttl_secs: @cache_ttl_secs,
1134
+ extraction_timeout_secs: @extraction_timeout_secs
1127
1135
  }
1128
1136
  end
1129
1137
 
@@ -1250,6 +1258,8 @@ module Kreuzberg
1250
1258
  @enable_quality_processing = value ? true : false
1251
1259
  when :force_ocr
1252
1260
  @force_ocr = value ? true : false
1261
+ when :force_ocr_pages
1262
+ @force_ocr_pages = value
1253
1263
  when :include_document_structure
1254
1264
  @include_document_structure = value ? true : false
1255
1265
  when :ocr
@@ -1286,6 +1296,8 @@ module Kreuzberg
1286
1296
  @cache_namespace = value
1287
1297
  when :cache_ttl_secs
1288
1298
  @cache_ttl_secs = value&.to_i
1299
+ when :extraction_timeout_secs
1300
+ @extraction_timeout_secs = value&.to_i
1289
1301
  else
1290
1302
  raise ArgumentError, "Unknown configuration key: #{key}"
1291
1303
  end
@@ -1345,6 +1357,7 @@ module Kreuzberg
1345
1357
  @use_cache = merged.use_cache
1346
1358
  @enable_quality_processing = merged.enable_quality_processing
1347
1359
  @force_ocr = merged.force_ocr
1360
+ @force_ocr_pages = merged.force_ocr_pages
1348
1361
  @include_document_structure = merged.include_document_structure
1349
1362
  @ocr = merged.ocr
1350
1363
  @chunking = merged.chunking
@@ -1369,6 +1382,7 @@ module Kreuzberg
1369
1382
  @result_format = merged.result_format
1370
1383
  @cache_namespace = merged.cache_namespace
1371
1384
  @cache_ttl_secs = merged.cache_ttl_secs
1385
+ @extraction_timeout_secs = merged.extraction_timeout_secs
1372
1386
  end
1373
1387
  end
1374
1388
  end
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.6.0'
4
+ VERSION = '4.6.1'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -481,7 +481,9 @@ module Kreuzberg
481
481
  attr_reader enable_quality_processing: bool
482
482
  attr_reader cache_namespace: String?
483
483
  attr_reader cache_ttl_secs: Integer?
484
+ attr_reader extraction_timeout_secs: Integer?
484
485
  attr_reader force_ocr: bool
486
+ attr_reader force_ocr_pages: Array[Integer]?
485
487
  attr_reader include_document_structure: bool
486
488
  attr_reader ocr: OCR?
487
489
  attr_reader chunking: Chunking?
@@ -508,6 +510,7 @@ module Kreuzberg
508
510
  ?use_cache: bool,
509
511
  ?enable_quality_processing: bool,
510
512
  ?force_ocr: bool,
513
+ ?force_ocr_pages: Array[Integer]?,
511
514
  ?include_document_structure: bool,
512
515
  ?ocr: (OCR | Hash[Symbol, untyped])?,
513
516
  ?chunking: (Chunking | Hash[Symbol, untyped])?,
@@ -525,7 +528,8 @@ module Kreuzberg
525
528
  ?output_format: String?,
526
529
  ?result_format: String?,
527
530
  ?cache_namespace: String?,
528
- ?cache_ttl_secs: Integer?
531
+ ?cache_ttl_secs: Integer?,
532
+ ?extraction_timeout_secs: Integer?
529
533
  ) -> void
530
534
  def to_h: () -> Hash[Symbol, untyped]
531
535
  def to_json: (*untyped) -> String
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.6.0"
5
+ version = "4.6.1"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -30,8 +30,8 @@ html-to-markdown-rs = { version = "2.29.0", default-features = false }
30
30
  image = { version = "0.25.10", default-features = false }
31
31
  itertools = "0.14"
32
32
  js-sys = "0.3"
33
- kreuzberg = { path = "./crates/kreuzberg", version = "4.6.0", default-features = false }
34
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.0" }
33
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.6.1", default-features = false }
34
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.6.1" }
35
35
  lazy_static = "1.5.0"
36
36
  libc = "0.2.183"
37
37
  log = "0.4"
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.6.0"
3
+ version = "4.6.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -53,7 +53,7 @@ office = [
53
53
  ]
54
54
  hwp = ["dep:cfb", "dep:flate2"]
55
55
  iwork = ["dep:zip", "dep:snap"]
56
- email = ["dep:mail-parser", "dep:cfb"]
56
+ email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"]
57
57
  html = ["dep:html-to-markdown-rs"]
58
58
  xml = ["dep:quick-xml", "dep:roxmltree"]
59
59
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
@@ -238,6 +238,7 @@ bytes = { version = "1", features = ["serde"] }
238
238
  calamine = { version = "0.34.0", features = ["dates"], optional = true }
239
239
  cfb = { version = "0.14", optional = true }
240
240
  chardetng = { version = "0.1.17", optional = true }
241
+ chrono = { version = "0.4", optional = true }
241
242
  dashmap = "6.1"
242
243
  dbase = { version = "0.7", optional = true }
243
244
  encoding_rs = { version = "0.8.35" }
@@ -287,6 +288,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
287
288
  "ndarray",
288
289
  "api-18",
289
290
  ], optional = true }
291
+ outlook-pst = { version = "1.2.0", optional = true }
290
292
  parking_lot = "0.12.5"
291
293
  pastey = "0.2"
292
294
  pdf_oxide = { version = "0.3.17", default-features = false, optional = true }
@@ -315,6 +317,7 @@ sha2 = { version = "0.10", optional = true }
315
317
  simdutf8 = { version = "0.1", optional = true }
316
318
  snap = { version = "1.1", optional = true }
317
319
  tar = { version = "0.4.45", optional = true }
320
+ tempfile = { version = "3.27.0", optional = true }
318
321
  text-splitter = { version = "0.29.3", features = ["markdown"], optional = true }
319
322
  thiserror = "2.0.18"
320
323
  tiff = { version = "0.11", optional = true }
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.6.0 Release**
21
+ > **🚀 Version 4.6.1 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/api/error.rs CHANGED
@@ -115,6 +115,7 @@ impl ApiError {
115
115
  KreuzbergError::Plugin { .. } => "PluginError",
116
116
  KreuzbergError::LockPoisoned(_) => "LockPoisonedError",
117
117
  KreuzbergError::UnsupportedFormat(_) => "UnsupportedFormatError",
118
+ KreuzbergError::Timeout { .. } => "TimeoutError",
118
119
  KreuzbergError::Other(_) => "Error",
119
120
  };
120
121
 
data/vendor/kreuzberg/src/api/openapi.rs CHANGED
@@ -74,6 +74,12 @@ use utoipa::OpenApi;
74
74
  crate::types::extraction::ElementId,
75
75
  crate::types::extraction::ElementType,
76
76
  crate::types::extraction::BoundingBox,
77
+ crate::types::ocr_elements::OcrElement,
78
+ crate::types::ocr_elements::OcrBoundingGeometry,
79
+ crate::types::ocr_elements::OcrConfidence,
80
+ crate::types::ocr_elements::OcrRotation,
81
+ crate::types::ocr_elements::OcrElementLevel,
82
+ crate::types::ocr_elements::OcrElementConfig,
77
83
  crate::types::metadata::Metadata,
78
84
  crate::types::tables::Table,
79
85
  crate::types::page::PageContent,
data/vendor/kreuzberg/src/core/config/extraction/core.rs CHANGED
@@ -47,6 +47,16 @@ pub struct ExtractionConfig {
47
47
  #[serde(default)]
48
48
  pub force_ocr: bool,
49
49
 
50
+ /// Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
51
+ ///
52
+ /// When set, only the listed pages are OCR'd regardless of text layer quality.
53
+ /// Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
54
+ /// Only applies to PDF documents. Duplicates are automatically deduplicated.
55
+ /// An `ocr` config is recommended for backend/language selection; defaults are used if absent.
56
+ #[serde(default)]
57
+ #[serde(skip_serializing_if = "Option::is_none")]
58
+ pub force_ocr_pages: Option<Vec<usize>>,
59
+
50
60
  /// Text chunking configuration (None = chunking disabled)
51
61
  #[serde(default)]
52
62
  pub chunking: Option<ChunkingConfig>,
@@ -89,6 +99,14 @@ pub struct ExtractionConfig {
89
99
  #[serde(default)]
90
100
  pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
91
101
 
102
+ /// Default per-file timeout in seconds for batch extraction.
103
+ ///
104
+ /// When set, each file in a batch will be canceled after this duration
105
+ /// unless overridden by [`FileExtractionConfig::timeout_secs`].
106
+ /// `None` means no timeout (unbounded extraction time).
107
+ #[serde(default)]
108
+ pub extraction_timeout_secs: Option<u64>,
109
+
92
110
  /// Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
93
111
  ///
94
112
  /// Limits parallelism to prevent resource exhaustion when processing
@@ -201,6 +219,7 @@ impl Default for ExtractionConfig {
201
219
  enable_quality_processing: true,
202
220
  ocr: None,
203
221
  force_ocr: false,
222
+ force_ocr_pages: None,
204
223
  chunking: None,
205
224
  images: None,
206
225
  #[cfg(feature = "pdf")]
@@ -213,6 +232,7 @@ impl Default for ExtractionConfig {
213
232
  postprocessor: None,
214
233
  #[cfg(feature = "html")]
215
234
  html_options: None,
235
+ extraction_timeout_secs: None,
216
236
  max_concurrent_extractions: None,
217
237
  #[cfg(feature = "archives")]
218
238
  security_limits: None,
@@ -259,6 +279,7 @@ impl ExtractionConfig {
259
279
  ref enable_quality_processing,
260
280
  ref ocr,
261
281
  ref force_ocr,
282
+ ref force_ocr_pages,
262
283
  ref chunking,
263
284
  ref images,
264
285
  #[cfg(feature = "pdf")]
@@ -276,6 +297,7 @@ impl ExtractionConfig {
276
297
  ref include_document_structure,
277
298
  #[cfg(feature = "layout-detection")]
278
299
  ref layout,
300
+ ref timeout_secs,
279
301
  } = *overrides;
280
302
 
281
303
  let mut config = self.clone();
@@ -289,6 +311,9 @@ impl ExtractionConfig {
289
311
  if let Some(v) = force_ocr {
290
312
  config.force_ocr = *v;
291
313
  }
314
+ if let Some(v) = force_ocr_pages {
315
+ config.force_ocr_pages = Some(v.clone());
316
+ }
292
317
  if let Some(v) = chunking {
293
318
  config.chunking = Some(v.clone());
294
319
  }
@@ -332,6 +357,9 @@ impl ExtractionConfig {
332
357
  if let Some(v) = layout {
333
358
  config.layout = Some(v.clone());
334
359
  }
360
+ if let Some(v) = timeout_secs {
361
+ config.extraction_timeout_secs = Some(*v);
362
+ }
335
363
 
336
364
  config
337
365
  }
data/vendor/kreuzberg/src/core/config/extraction/file_config.rs CHANGED
@@ -57,6 +57,10 @@ pub struct FileExtractionConfig {
57
57
  #[serde(skip_serializing_if = "Option::is_none")]
58
58
  pub force_ocr: Option<bool>,
59
59
 
60
+ /// Override force OCR pages for this file (1-indexed page numbers).
61
+ #[serde(skip_serializing_if = "Option::is_none")]
62
+ pub force_ocr_pages: Option<Vec<usize>>,
63
+
60
64
  /// Override chunking configuration for this file.
61
65
  #[serde(skip_serializing_if = "Option::is_none")]
62
66
  pub chunking: Option<ChunkingConfig>,
@@ -112,4 +116,12 @@ pub struct FileExtractionConfig {
112
116
  #[cfg(feature = "layout-detection")]
113
117
  #[serde(skip_serializing_if = "Option::is_none")]
114
118
  pub layout: Option<super::super::layout::LayoutDetectionConfig>,
119
+
120
+ /// Override per-file extraction timeout in seconds.
121
+ ///
122
+ /// When set, the extraction for this file will be canceled after the
123
+ /// specified duration. A timed-out file produces an error result without
124
+ /// affecting other files in the batch.
125
+ #[serde(skip_serializing_if = "Option::is_none")]
126
+ pub timeout_secs: Option<u64>,
115
127
  }
data/vendor/kreuzberg/src/core/extractor/batch.rs CHANGED
@@ -63,11 +63,12 @@ where
63
63
  Ok(results.into_iter().map(|r| r.unwrap()).collect())
64
64
  }
65
65
 
66
- /// Run a single extraction task with semaphore gating, timing, and batch mode.
66
+ /// Run a single extraction task with semaphore gating, timing, optional timeout, and batch mode.
67
67
  #[cfg(feature = "tokio-runtime")]
68
68
  async fn run_timed_extraction<F, Fut>(
69
69
  index: usize,
70
70
  semaphore: Arc<tokio::sync::Semaphore>,
71
+ timeout_secs: Option<u64>,
71
72
  extract_fn: F,
72
73
  ) -> (usize, Result<ExtractionResult>, u64)
73
74
  where
@@ -76,7 +77,23 @@ where
76
77
  {
77
78
  let _permit = semaphore.acquire().await.unwrap();
78
79
  let start = Instant::now();
79
- let mut result = crate::core::batch_mode::with_batch_mode(extract_fn()).await;
80
+
81
+ let extraction_future = crate::core::batch_mode::with_batch_mode(extract_fn());
82
+
83
+ let mut result = match timeout_secs {
84
+ Some(secs) => match tokio::time::timeout(std::time::Duration::from_secs(secs), extraction_future).await {
85
+ Ok(inner) => inner,
86
+ Err(_elapsed) => {
87
+ let elapsed_ms = start.elapsed().as_millis() as u64;
88
+ Err(KreuzbergError::Timeout {
89
+ elapsed_ms,
90
+ limit_ms: secs * 1000,
91
+ })
92
+ }
93
+ },
94
+ None => extraction_future.await,
95
+ };
96
+
80
97
  let elapsed_ms = start.elapsed().as_millis() as u64;
81
98
 
82
99
  if let Ok(ref mut r) = result {
@@ -182,7 +199,8 @@ pub async fn batch_extract_file(
182
199
  async move {
183
200
  let (ref path, ref file_config) = items[index];
184
201
  let resolved = resolve_config(&cfg, file_config);
185
- run_timed_extraction(index, sem, || {
202
+ let timeout = resolved.extraction_timeout_secs;
203
+ run_timed_extraction(index, sem, timeout, || {
186
204
  let path = path.clone();
187
205
  async move { extract_file(&path, None, &resolved).await }
188
206
  })
@@ -282,7 +300,8 @@ pub async fn batch_extract_bytes(
282
300
  async move {
283
301
  let (bytes, mime_type, file_config) = slots[index].lock().take().expect("batch item already consumed");
284
302
  let resolved = resolve_config(&cfg, &file_config);
285
- run_timed_extraction(index, sem, || async move {
303
+ let timeout = resolved.extraction_timeout_secs;
304
+ run_timed_extraction(index, sem, timeout, || async move {
286
305
  extract_bytes(&bytes, &mime_type, &resolved).await
287
306
  })
288
307
  .await
data/vendor/kreuzberg/src/core/mime.rs CHANGED
@@ -35,7 +35,9 @@ pub const LEGACY_POWERPOINT_MIME_TYPE: &str = "application/vnd.ms-powerpoint";
35
35
 
36
36
  pub const EML_MIME_TYPE: &str = "message/rfc822";
37
37
  pub const MSG_MIME_TYPE: &str = "application/vnd.ms-outlook";
38
+ pub const PST_MIME_TYPE: &str = "application/vnd.ms-outlook-pst";
38
39
  pub const JSON_MIME_TYPE: &str = "application/json";
40
+ pub const JSONL_MIME_TYPE: &str = "application/x-ndjson";
39
41
  pub const YAML_MIME_TYPE: &str = "application/x-yaml";
40
42
  pub const TOML_MIME_TYPE: &str = "application/toml";
41
43
  pub const XML_MIME_TYPE: &str = "application/xml";
@@ -368,6 +370,11 @@ static FORMATS: &[FormatEntry] = &[
368
370
  mime_type: "application/csl+json",
369
371
  aliases: &[],
370
372
  },
373
+ FormatEntry {
374
+ extensions: &["jsonl", "ndjson"],
375
+ mime_type: "application/x-ndjson",
376
+ aliases: &["application/jsonl", "application/x-jsonlines"],
377
+ },
371
378
  FormatEntry {
372
379
  extensions: &["yaml", "yml"],
373
380
  mime_type: "application/x-yaml",
@@ -399,6 +406,11 @@ static FORMATS: &[FormatEntry] = &[
399
406
  mime_type: "application/vnd.ms-outlook",
400
407
  aliases: &[],
401
408
  },
409
+ FormatEntry {
410
+ extensions: &["pst"],
411
+ mime_type: "application/vnd.ms-outlook-pst",
412
+ aliases: &[],
413
+ },
402
414
  // ── Archives ────────────────────────────────────────────────────────
403
415
  FormatEntry {
404
416
  extensions: &["zip"],
data/vendor/kreuzberg/src/error.rs CHANGED
@@ -124,6 +124,9 @@ pub enum KreuzbergError {
124
124
  #[error("Unsupported format: {0}")]
125
125
  UnsupportedFormat(String),
126
126
 
127
+ #[error("Extraction timed out after {elapsed_ms}ms (limit: {limit_ms}ms)")]
128
+ Timeout { elapsed_ms: u64, limit_ms: u64 },
129
+
127
130
  #[error("{0}")]
128
131
  Other(String),
129
132
  }
data/vendor/kreuzberg/src/extraction/mod.rs CHANGED
@@ -22,6 +22,9 @@ pub mod archive;
22
22
  #[cfg(feature = "email")]
23
23
  pub mod email;
24
24
 
25
+ #[cfg(feature = "email")]
26
+ pub mod pst;
27
+
25
28
  #[cfg(any(feature = "excel", feature = "excel-wasm"))]
26
29
  pub mod excel;
27
30
 
@@ -77,6 +80,9 @@ pub use archive::{
77
80
  #[cfg(feature = "email")]
78
81
  pub use email::{build_email_text_output, extract_email_content, parse_eml_content, parse_msg_content};
79
82
 
83
+ #[cfg(feature = "email")]
84
+ pub use pst::extract_pst_messages;
85
+
80
86
  #[cfg(any(feature = "excel", feature = "excel-wasm"))]
81
87
  pub use excel::{excel_to_markdown, read_excel_bytes, read_excel_file};
82
88