kreuzberg 4.9.0 → 4.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +7 -7
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +4 -4
  7. data/vendor/kreuzberg/Cargo.toml +2 -2
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/ocr.rs +33 -35
  10. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +1 -3
  11. data/vendor/kreuzberg/src/extractors/doc.rs +6 -1
  12. data/vendor/kreuzberg/src/extractors/excel.rs +3 -0
  13. data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +6 -1
  14. data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +6 -1
  15. data/vendor/kreuzberg/src/extractors/iwork/pages.rs +6 -1
  16. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +18 -0
  17. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +21 -2
  18. data/vendor/kreuzberg/src/extractors/ppt.rs +6 -1
  19. data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs +17 -19
  20. data/vendor/kreuzberg/src/layout/model_manager.rs +10 -0
  21. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  22. data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +2 -2
  23. data/vendor/kreuzberg/src/pdf/oxide/table.rs +4 -6
  24. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +630 -61
  25. data/vendor/kreuzberg/src/pdf/structure/regions/tables.rs +24 -0
  26. data/vendor/kreuzberg-ffi/Cargo.toml +2 -2
  27. data/vendor/kreuzberg-ffi/kreuzberg.h +4 -4
  28. data/vendor/kreuzberg-ffi/src/error.rs +9 -8
  29. data/vendor/kreuzberg-ffi/tests/c/test_error.c +4 -1
  30. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  31. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. data/vendor/kreuzberg-tesseract/build.rs +52 -32
  34. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb7b77bae36a5da34ce209fbf1ea7c0a68aef4b22f8b373b908f9c113f404ef5
4
- data.tar.gz: a6c8667aee6ae2c9e11d45fc98fcb355561fec6e4a7d51d852664bd6367af8cc
3
+ metadata.gz: 431f2a7c19d9a4404369b7c9fe80fd10233b34b3462cf3c0076aac9b7f3065cb
4
+ data.tar.gz: bf36fd9dd46ffb84374e8f64ffb1f61ac6774cc8dd56d43d404e0b8995813ffc
5
5
  SHA512:
6
- metadata.gz: 7569a4914ab4a4d440a0c74e622a9f26f7189b62bc9c2d05fc5e857a32c8fabde8eb854edef34e94bd95d5357e44137c1573e7ce68db45ed85c26dbe31e6972b
7
- data.tar.gz: 9741106549d7bf79cc1ae34a07f686cca1bf6a4c19fcb01b40cd8f1372166c8e9c3a0321e1e26e416ebb98d413ee1c9093d247949292a7c07a85594ea1df508e
6
+ metadata.gz: 2cf3ad1e3edd0778537e542bcafc121b6cd14235e271d2b9b27747f7beab7a2899362426f450a06f9ff0d0d4365982acb68300e46568887a871ad1bda6767d00
7
+ data.tar.gz: e7bcbe73f50701fbd4a0cb7e75e588096c02202198b234ba6f830b846f4fabd95e051ed0c25ed4a1c1d72a933cf5ccf9d755129eb1dc54ea446a60d9d9946997
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -2916,7 +2916,7 @@ dependencies = [
2916
2916
 
2917
2917
  [[package]]
2918
2918
  name = "kreuzberg-rb"
2919
- version = "4.8.6"
2919
+ version = "4.9.2"
2920
2920
  dependencies = [
2921
2921
  "async-trait",
2922
2922
  "html-to-markdown-rs",
@@ -3040,9 +3040,9 @@ checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
3040
3040
 
3041
3041
  [[package]]
3042
3042
  name = "liter-llm"
3043
- version = "1.2.1"
3043
+ version = "1.2.2"
3044
3044
  source = "registry+https://github.com/rust-lang/crates.io-index"
3045
- checksum = "1884be380e45da823105c85ef0fa188af81d57be7de9b65016576e1774fdd5f8"
3045
+ checksum = "4e4ce5d2d0b09f2e63537ba40b15b0a95c2d6818ed0454eb04d9593ba4a0cad3"
3046
3046
  dependencies = [
3047
3047
  "base64 0.22.1",
3048
3048
  "bytes",
@@ -5734,9 +5734,9 @@ checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
5734
5734
 
5735
5735
  [[package]]
5736
5736
  name = "typenum"
5737
- version = "1.19.0"
5737
+ version = "1.20.0"
5738
5738
  source = "registry+https://github.com/rust-lang/crates.io-index"
5739
- checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
5739
+ checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
5740
5740
 
5741
5741
  [[package]]
5742
5742
  name = "unicase"
@@ -6186,9 +6186,9 @@ dependencies = [
6186
6186
 
6187
6187
  [[package]]
6188
6188
  name = "web_atoms"
6189
- version = "0.2.3"
6189
+ version = "0.2.4"
6190
6190
  source = "registry+https://github.com/rust-lang/crates.io-index"
6191
- checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576"
6191
+ checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538"
6192
6192
  dependencies = [
6193
6193
  "phf",
6194
6194
  "phf_codegen",
data/ext/kreuzberg_rb/native/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.9.0"
3
+ version = "4.9.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
data/lib/kreuzberg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.9.0'
4
+ VERSION = '4.9.2'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.9.0"
5
+ version = "4.9.2"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.2.5", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.9.0", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.0" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.9.2", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.9.2" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.185"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.33", default-features = false }
48
+ pdf_oxide = { version = "0.3.34", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.12.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
data/vendor/kreuzberg/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.9.0"
3
+ version = "4.9.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -314,7 +314,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.33", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.34", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
data/vendor/kreuzberg/README.md CHANGED
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.9.0 Release**
21
+ > **🚀 Version 4.9.2 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
data/vendor/kreuzberg/src/core/config/ocr.rs CHANGED
@@ -323,9 +323,12 @@ impl OcrConfig {
323
323
  /// Returns the effective pipeline config.
324
324
  ///
325
325
  /// - If `pipeline` is explicitly set, returns it.
326
- /// - If `paddle-ocr` feature is compiled in and no explicit pipeline is set,
327
- /// auto-constructs a default pipeline: primary backend (priority 100) + paddleocr (priority 50).
328
- /// - Otherwise returns `None` (single-backend mode, same as today).
326
+ /// - If `paddle-ocr` is compiled in and the backend is the default
327
+ /// (tesseract), auto-constructs `[tesseract @ 100, paddleocr @ 50]`.
328
+ /// - Otherwise returns `None` (single-backend mode).
329
+ ///
330
+ /// Explicit non-default backend selections are honored as-is — a silent
331
+ /// paddleocr fallback would mask errors from the chosen backend.
329
332
  pub fn effective_pipeline(&self) -> Option<OcrPipelineConfig> {
330
333
  if self.pipeline.is_some() {
331
334
  return self.pipeline.clone();
@@ -333,25 +336,28 @@ impl OcrConfig {
333
336
 
334
337
  #[cfg(feature = "paddle-ocr")]
335
338
  {
336
- let mut stages = vec![OcrPipelineStage {
337
- backend: self.backend.clone(),
338
- priority: 100,
339
- language: None,
340
- tesseract_config: self.tesseract_config.clone(),
341
- paddle_ocr_config: None,
342
- vlm_config: self.vlm_config.clone(),
343
- }];
344
- // Only add paddleocr fallback if primary backend isn't already paddleocr
345
- if self.backend != "paddleocr" {
346
- stages.push(OcrPipelineStage {
339
+ if self.backend != default_tesseract_backend() {
340
+ return None;
341
+ }
342
+
343
+ let stages = vec![
344
+ OcrPipelineStage {
345
+ backend: self.backend.clone(),
346
+ priority: 100,
347
+ language: None,
348
+ tesseract_config: self.tesseract_config.clone(),
349
+ paddle_ocr_config: None,
350
+ vlm_config: self.vlm_config.clone(),
351
+ },
352
+ OcrPipelineStage {
347
353
  backend: "paddleocr".to_string(),
348
354
  priority: 50,
349
355
  language: None,
350
356
  tesseract_config: None,
351
357
  paddle_ocr_config: self.paddle_ocr_config.clone(),
352
358
  vlm_config: None,
353
- });
354
- }
359
+ },
360
+ ];
355
361
  Some(OcrPipelineConfig {
356
362
  stages,
357
363
  quality_thresholds: self.effective_thresholds(),
@@ -485,29 +491,21 @@ mod tests {
485
491
  }
486
492
 
487
493
  #[test]
488
- fn test_effective_pipeline_paddleocr_backend_no_duplicate() {
489
- // When primary backend is "paddleocr", effective_pipeline should NOT add
490
- // a second paddleocr stage (issue #6 fix).
494
+ fn test_effective_pipeline_explicit_paddleocr_no_autofallback() {
491
495
  let config = OcrConfig {
492
496
  backend: "paddleocr".to_string(),
493
497
  ..Default::default()
494
498
  };
495
- let result = config.effective_pipeline();
496
- // With paddle-ocr feature: should have exactly 1 stage (no duplicate)
497
- // Without paddle-ocr feature: should be None
498
- #[cfg(feature = "paddle-ocr")]
499
- {
500
- let pipeline = result.unwrap();
501
- let paddle_count = pipeline.stages.iter().filter(|s| s.backend == "paddleocr").count();
502
- assert_eq!(
503
- paddle_count, 1,
504
- "Should not have duplicate paddleocr stages, found {paddle_count}"
505
- );
506
- }
507
- #[cfg(not(feature = "paddle-ocr"))]
508
- {
509
- assert!(result.is_none());
510
- }
499
+ assert!(config.effective_pipeline().is_none());
500
+ }
501
+
502
+ #[test]
503
+ fn test_effective_pipeline_explicit_easyocr_no_autofallback() {
504
+ let config = OcrConfig {
505
+ backend: "easyocr".to_string(),
506
+ ..Default::default()
507
+ };
508
+ assert!(config.effective_pipeline().is_none());
511
509
  }
512
510
 
513
511
  #[test]
data/vendor/kreuzberg/src/extraction/pptx/mod.rs CHANGED
@@ -282,9 +282,7 @@ fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
282
282
  }),
283
283
  });
284
284
 
285
- let document = doc_builder
286
- .map(|b| b.build())
287
- .and_then(|d| if d.is_empty() { None } else { Some(d) });
285
+ let document = doc_builder.map(|b| b.build()).filter(|d| !d.is_empty());
288
286
 
289
287
  Ok(PptxExtractionResult {
290
288
  content,
data/vendor/kreuzberg/src/extractors/doc.rs CHANGED
@@ -85,7 +85,12 @@ impl DocumentExtractor for DocExtractor {
85
85
  }
86
86
 
87
87
  #[cfg(not(feature = "tokio-runtime"))]
88
- extract_doc_text(content)
88
+ {
89
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
90
+ return Err(crate::error::KreuzbergError::Cancelled);
91
+ }
92
+ extract_doc_text(content)
93
+ }
89
94
  }?;
90
95
 
91
96
  let mut doc = InternalDocument::new("doc");
data/vendor/kreuzberg/src/extractors/excel.rs CHANGED
@@ -215,6 +215,9 @@ impl DocumentExtractor for ExcelExtractor {
215
215
  }
216
216
  #[cfg(not(feature = "tokio-runtime"))]
217
217
  {
218
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
219
+ return Err(crate::error::KreuzbergError::Cancelled);
220
+ }
218
221
  crate::extraction::excel::read_excel_bytes(content, extension)?
219
222
  }
220
223
  };
data/vendor/kreuzberg/src/extractors/iwork/keynote.rs CHANGED
@@ -172,7 +172,12 @@ impl DocumentExtractor for KeynoteExtractor {
172
172
  }
173
173
 
174
174
  #[cfg(not(feature = "tokio-runtime"))]
175
- parse_keynote(content)?
175
+ {
176
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
177
+ return Err(crate::error::KreuzbergError::Cancelled);
178
+ }
179
+ parse_keynote(content)?
180
+ }
176
181
  };
177
182
 
178
183
  let mut doc = build_keynote_internal_document(&data);
data/vendor/kreuzberg/src/extractors/iwork/numbers.rs CHANGED
@@ -181,7 +181,12 @@ impl DocumentExtractor for NumbersExtractor {
181
181
  }
182
182
 
183
183
  #[cfg(not(feature = "tokio-runtime"))]
184
- parse_numbers(content)?
184
+ {
185
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
186
+ return Err(crate::error::KreuzbergError::Cancelled);
187
+ }
188
+ parse_numbers(content)?
189
+ }
185
190
  };
186
191
 
187
192
  let mut doc = build_numbers_internal_document(&data);
data/vendor/kreuzberg/src/extractors/iwork/pages.rs CHANGED
@@ -163,7 +163,12 @@ impl DocumentExtractor for PagesExtractor {
163
163
  }
164
164
 
165
165
  #[cfg(not(feature = "tokio-runtime"))]
166
- parse_pages(content)?
166
+ {
167
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
168
+ return Err(crate::error::KreuzbergError::Cancelled);
169
+ }
170
+ parse_pages(content)?
171
+ }
167
172
  };
168
173
 
169
174
  let mut doc = build_pages_internal_document(&data);
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs CHANGED
@@ -420,6 +420,10 @@ pub(crate) fn extract_all_from_oxide_document(
420
420
  content: &[u8],
421
421
  config: &ExtractionConfig,
422
422
  layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]>,
423
+ #[cfg(feature = "layout-detection")] layout_images: Option<&[image::DynamicImage]>,
424
+ #[cfg(not(feature = "layout-detection"))] _layout_images: Option<()>,
425
+ #[cfg(feature = "layout-detection")] layout_results: Option<&[crate::pdf::layout_runner::PageLayoutResult]>,
426
+ #[cfg(not(feature = "layout-detection"))] _layout_results: Option<()>,
423
427
  ) -> Result<PdfExtractionPhaseResult> {
424
428
  let _span = tracing::debug_span!("extract_pdf_oxide").entered();
425
429
 
@@ -460,6 +464,11 @@ pub(crate) fn extract_all_from_oxide_document(
460
464
  OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
461
465
  );
462
466
 
467
+ let allow_single_column = config
468
+ .pdf_options
469
+ .as_ref()
470
+ .is_some_and(|o| o.allow_single_column_tables);
471
+
463
472
  let pre_rendered_doc =
464
473
  if needs_structured && !config.force_ocr {
465
474
  let k = config
@@ -503,6 +512,15 @@ pub(crate) fn extract_all_from_oxide_document(
503
512
  used_structure_tree,
504
513
  image_positions: &image_positions,
505
514
  layout_hints,
515
+ allow_single_column,
516
+ #[cfg(feature = "layout-detection")]
517
+ layout_images,
518
+ #[cfg(feature = "layout-detection")]
519
+ layout_results,
520
+ #[cfg(feature = "layout-detection")]
521
+ table_model: config.layout.as_ref().map(|l| l.table_model).unwrap_or_default(),
522
+ #[cfg(feature = "layout-detection")]
523
+ acceleration: config.acceleration.as_ref(),
506
524
  },
507
525
  ) {
508
526
  Ok(structured_doc) if !structured_doc.elements.is_empty() => {
data/vendor/kreuzberg/src/extractors/pdf/mod.rs CHANGED
@@ -1192,7 +1192,14 @@ impl PdfExtractor {
1192
1192
  #[cfg(feature = "layout-detection")]
1193
1193
  let layout_bundle = run_layout_detection(content, config);
1194
1194
  #[cfg(feature = "layout-detection")]
1195
- let layout_hints = layout_bundle.as_ref().map(|b| b.hints.as_slice());
1195
+ let (layout_hints, layout_images, layout_results) = match layout_bundle {
1196
+ Some(ref bundle) => (
1197
+ Some(bundle.hints.as_slice()),
1198
+ Some(bundle.images.as_slice()),
1199
+ Some(bundle.results.as_slice()),
1200
+ ),
1201
+ None => (None, None, None),
1202
+ };
1196
1203
  #[cfg(not(feature = "layout-detection"))]
1197
1204
  let layout_hints: Option<&[Vec<crate::pdf::structure::types::LayoutHint>]> = None;
1198
1205
 
@@ -1206,7 +1213,19 @@ impl PdfExtractor {
1206
1213
  pre_rendered_doc,
1207
1214
  _has_font_encoding_issues,
1208
1215
  pdf_annotations,
1209
- ) = extract_all_from_oxide_document(content, config, layout_hints)?;
1216
+ ) = extract_all_from_oxide_document(
1217
+ content,
1218
+ config,
1219
+ layout_hints,
1220
+ #[cfg(feature = "layout-detection")]
1221
+ layout_images,
1222
+ #[cfg(not(feature = "layout-detection"))]
1223
+ None,
1224
+ #[cfg(feature = "layout-detection")]
1225
+ layout_results,
1226
+ #[cfg(not(feature = "layout-detection"))]
1227
+ None,
1228
+ )?;
1210
1229
 
1211
1230
  // --- OCR evaluation (reuses the same logic as the pdfium path) ---
1212
1231
  #[cfg(feature = "ocr")]
data/vendor/kreuzberg/src/extractors/ppt.rs CHANGED
@@ -136,7 +136,12 @@ impl DocumentExtractor for PptExtractor {
136
136
  }
137
137
 
138
138
  #[cfg(not(feature = "tokio-runtime"))]
139
- crate::extraction::ppt::extract_ppt_text_with_options(content, include_master_slides)
139
+ {
140
+ if config.cancel_token.as_ref().map(|t| t.is_cancelled()).unwrap_or(false) {
141
+ return Err(crate::error::KreuzbergError::Cancelled);
142
+ }
143
+ crate::extraction::ppt::extract_ppt_text_with_options(content, include_master_slides)
144
+ }
140
145
  }?;
141
146
 
142
147
  let mut metadata_map = AHashMap::new();
data/vendor/kreuzberg/src/keywords/yake/preprocessor.rs CHANGED
@@ -90,28 +90,26 @@ fn find_sentence_end(text: &str, from: usize) -> Option<usize> {
90
90
  }
91
91
 
92
92
  // Look for sentence terminals: . ! ?
93
- match memchr3(b'.', b'!', b'?', &bytes[pos..]) {
94
- None => return None,
95
- Some(offset) => {
96
- let terminal_pos = pos + offset;
97
- // Consume consecutive terminals (e.g., "..." or "?!")
98
- let mut end = terminal_pos + 1;
99
- while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
100
- end += 1;
101
- }
102
-
103
- // Consume closing quotes/brackets after terminal
104
- while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
105
- end += 1;
106
- }
93
+ {
94
+ let offset = memchr3(b'.', b'!', b'?', &bytes[pos..])?;
95
+ let terminal_pos = pos + offset;
96
+ // Consume consecutive terminals (e.g., "..." or "?!")
97
+ let mut end = terminal_pos + 1;
98
+ while end < bytes.len() && (bytes[end] == b'.' || bytes[end] == b'!' || bytes[end] == b'?') {
99
+ end += 1;
100
+ }
107
101
 
108
- // Check if this is a real sentence boundary
109
- if is_sentence_boundary(text, terminal_pos, end) {
110
- return Some(end);
111
- }
102
+ // Consume closing quotes/brackets after terminal
103
+ while end < bytes.len() && matches!(bytes[end], b'"' | b'\'' | b')' | b']' | b'}') {
104
+ end += 1;
105
+ }
112
106
 
113
- pos = end;
107
+ // Check if this is a real sentence boundary
108
+ if is_sentence_boundary(text, terminal_pos, end) {
109
+ return Some(end);
114
110
  }
111
+
112
+ pos = end;
115
113
  }
116
114
  }
117
115
 
data/vendor/kreuzberg/src/layout/model_manager.rs CHANGED
@@ -9,8 +9,18 @@ use std::path::{Path, PathBuf};
9
9
  use crate::layout::error::LayoutError;
10
10
  use crate::model_download;
11
11
 
12
+ #[cfg(feature = "paddle-ocr")]
12
13
  use crate::paddle_ocr::ModelManifestEntry;
13
14
 
15
+ #[cfg(not(feature = "paddle-ocr"))]
16
+ #[derive(Debug, Clone, serde::Serialize)]
17
+ pub struct ModelManifestEntry {
18
+ pub relative_path: String,
19
+ pub sha256: String,
20
+ pub size_bytes: u64,
21
+ pub source_url: String,
22
+ }
23
+
14
24
  /// Model definition for a layout model.
15
25
  #[derive(Debug, Clone)]
16
26
  struct ModelDefinition {
data/vendor/kreuzberg/src/pdf/metadata.rs CHANGED
@@ -320,12 +320,12 @@ pub fn extract_common_metadata_from_document(document: &PdfDocument<'_>) -> Resu
320
320
  let authors = metadata_cache[2]
321
321
  .as_ref()
322
322
  .map(|author_str| parse_authors(author_str))
323
- .and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
323
+ .filter(|parsed| !parsed.is_empty());
324
324
 
325
325
  let keywords = metadata_cache[3]
326
326
  .as_ref()
327
327
  .map(|keywords_str| parse_keywords(keywords_str))
328
- .and_then(|parsed| if !parsed.is_empty() { Some(parsed) } else { None });
328
+ .filter(|parsed| !parsed.is_empty());
329
329
 
330
330
  let created_at = metadata_cache[4].as_ref().map(|date_str| parse_pdf_date(date_str));
331
331
 
data/vendor/kreuzberg/src/pdf/oxide/metadata.rs CHANGED
@@ -94,11 +94,11 @@ fn extract_common_metadata(doc: &mut OxideDocument) -> Result<CommonPdfMetadata>
94
94
 
95
95
  let authors = get_info_string(&mut doc.doc, "Author")
96
96
  .map(|author_str| parse_authors(&author_str))
97
- .and_then(|parsed| if parsed.is_empty() { None } else { Some(parsed) });
97
+ .filter(|parsed| !parsed.is_empty());
98
98
 
99
99
  let keywords = get_info_string(&mut doc.doc, "Keywords")
100
100
  .map(|kw_str| parse_keywords(&kw_str))
101
- .and_then(|parsed| if parsed.is_empty() { None } else { Some(parsed) });
101
+ .filter(|parsed| !parsed.is_empty());
102
102
 
103
103
  let created_at = get_info_string(&mut doc.doc, "CreationDate").map(|d| parse_pdf_date(&d));
104
104
  let modified_at = get_info_string(&mut doc.doc, "ModDate").map(|d| parse_pdf_date(&d));
data/vendor/kreuzberg/src/pdf/oxide/table.rs CHANGED
@@ -75,9 +75,7 @@ pub(crate) fn extract_tables_native(doc: &mut OxideDocument) -> Result<Vec<Table
75
75
  ///
76
76
  /// Maps rows/cells from the native table structure to a 2D `Vec<Vec<String>>`
77
77
  /// grid and builds a markdown representation with proper header separators.
78
- fn convert_extracted_table(
79
- table: &pdf_oxide::structure::table_extractor::ExtractedTable,
80
- ) -> (Vec<Vec<String>>, String) {
78
+ fn convert_extracted_table(table: &pdf_oxide::structure::table_extractor::Table) -> (Vec<Vec<String>>, String) {
81
79
  let mut cells: Vec<Vec<String>> = Vec::with_capacity(table.rows.len());
82
80
  let mut markdown = String::new();
83
81
  let mut found_header = false;
@@ -124,7 +122,7 @@ mod tests {
124
122
 
125
123
  #[test]
126
124
  fn test_convert_extracted_table_basic() {
127
- use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
125
+ use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
128
126
 
129
127
  let table = ExtractedTable {
130
128
  rows: vec![
@@ -191,7 +189,7 @@ mod tests {
191
189
 
192
190
  #[test]
193
191
  fn test_convert_extracted_table_no_header() {
194
- use pdf_oxide::structure::table_extractor::{ExtractedTable, TableCell, TableRow};
192
+ use pdf_oxide::structure::table_extractor::{Table as ExtractedTable, TableCell, TableRow};
195
193
 
196
194
  let table = ExtractedTable {
197
195
  rows: vec![
@@ -233,7 +231,7 @@ mod tests {
233
231
 
234
232
  #[test]
235
233
  fn test_convert_extracted_table_empty() {
236
- use pdf_oxide::structure::table_extractor::ExtractedTable;
234
+ use pdf_oxide::structure::table_extractor::Table as ExtractedTable;
237
235
 
238
236
  let table = ExtractedTable {
239
237
  rows: vec![],