kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316

data/vendor/kreuzberg/src/core/pipeline.rs

@@ -47,6 +47,24 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig
         let _ = crate::keywords::ensure_initialized();
     }
 
+    #[cfg(feature = "language-detection")]
+    {
+        let _ = crate::language_detection::ensure_initialized();
+    }
+
+    #[cfg(feature = "chunking")]
+    {
+        let _ = crate::chunking::ensure_initialized();
+    }
+
+    #[cfg(feature = "quality")]
+    {
+        let registry = crate::plugins::registry::get_post_processor_registry();
+        if let Ok(mut reg) = registry.write() {
+            let _ = reg.register(std::sync::Arc::new(crate::text::QualityProcessor), 30);
+        }
+    }
+
     let processor_registry = crate::plugins::registry::get_post_processor_registry();
 
     for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
@@ -130,7 +148,9 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig
             chunker_type: crate::chunking::ChunkerType::Text,
         };
 
-        match crate::chunking::chunk_text(&result.content, &chunk_config) {
+        let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
+
+        match crate::chunking::chunk_text(&result.content, &chunk_config, page_boundaries) {
             Ok(chunking_result) => {
                 result.chunks = Some(chunking_result.chunks);
 
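In the hunk above, `chunk_text` gains a third parameter that threads page boundaries from extraction metadata into the chunker, so chunks can later be attributed to source pages. As a rough illustration of the same call pattern outside the pipeline (a minimal sketch; the public `kreuzberg::chunking` paths are assumptions, and the config values are arbitrary):

    // Sketch only: mirrors the pipeline's new page-aware chunk_text call.
    let chunk_config = kreuzberg::chunking::ChunkingConfig {
        max_characters: 2000,
        overlap: 200,
        trim: true,
        chunker_type: kreuzberg::chunking::ChunkerType::Text,
    };
    // metadata.pages.boundaries is Option<Vec<PageBoundary>>; as_deref yields Option<&[PageBoundary]>.
    let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
    let chunking_result = kreuzberg::chunking::chunk_text(&result.content, &chunk_config, page_boundaries)?;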
@@ -228,6 +248,157 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig
     Ok(result)
 }
 
+/// Run the post-processing pipeline synchronously (WASM-compatible version).
+///
+/// This is a synchronous implementation for WASM and non-async contexts.
+/// It performs a subset of the full async pipeline, excluding async post-processors
+/// and validators.
+///
+/// # Arguments
+///
+/// * `result` - The extraction result to process
+/// * `config` - Extraction configuration
+///
+/// # Returns
+///
+/// The processed extraction result.
+///
+/// # Notes
+///
+/// This function is only available when the `tokio-runtime` feature is disabled.
+/// It handles:
+/// - Quality processing (if enabled)
+/// - Chunking (if enabled)
+/// - Language detection (if enabled)
+///
+/// It does NOT handle:
+/// - Async post-processors
+/// - Async validators
+#[cfg(not(feature = "tokio-runtime"))]
+pub fn run_pipeline_sync(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
+    // Quality processing
+    #[cfg(feature = "quality")]
+    if config.enable_quality_processing {
+        let quality_score = crate::text::quality::calculate_quality_score(
+            &result.content,
+            Some(
+                &result
+                    .metadata
+                    .additional
+                    .iter()
+                    .map(|(k, v)| (k.clone(), v.to_string()))
+                    .collect(),
+            ),
+        );
+        result.metadata.additional.insert(
+            "quality_score".to_string(),
+            serde_json::Value::Number(
+                serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
+            ),
+        );
+    }
+
+    #[cfg(not(feature = "quality"))]
+    if config.enable_quality_processing {
+        result.metadata.additional.insert(
+            "quality_processing_error".to_string(),
+            serde_json::Value::String("Quality processing feature not enabled".to_string()),
+        );
+    }
+
+    // Chunking
+    #[cfg(feature = "chunking")]
+    if let Some(ref chunking_config) = config.chunking {
+        let chunk_config = crate::chunking::ChunkingConfig {
+            max_characters: chunking_config.max_chars,
+            overlap: chunking_config.max_overlap,
+            trim: true,
+            chunker_type: crate::chunking::ChunkerType::Text,
+        };
+
+        match crate::chunking::chunk_text(&result.content, &chunk_config, None) {
+            Ok(chunking_result) => {
+                result.chunks = Some(chunking_result.chunks);
+
+                if let Some(ref chunks) = result.chunks {
+                    result.metadata.additional.insert(
+                        "chunk_count".to_string(),
+                        serde_json::Value::Number(serde_json::Number::from(chunks.len())),
+                    );
+                }
+
+                #[cfg(feature = "embeddings")]
+                if let Some(ref embedding_config) = chunking_config.embedding
+                    && let Some(ref mut chunks) = result.chunks
+                {
+                    match crate::embeddings::generate_embeddings_for_chunks(chunks, embedding_config) {
+                        Ok(()) => {
+                            result
+                                .metadata
+                                .additional
+                                .insert("embeddings_generated".to_string(), serde_json::Value::Bool(true));
+                        }
+                        Err(e) => {
+                            result
+                                .metadata
+                                .additional
+                                .insert("embedding_error".to_string(), serde_json::Value::String(e.to_string()));
+                        }
+                    }
+                }
+
+                #[cfg(not(feature = "embeddings"))]
+                if chunking_config.embedding.is_some() {
+                    result.metadata.additional.insert(
+                        "embedding_error".to_string(),
+                        serde_json::Value::String("Embeddings feature not enabled".to_string()),
+                    );
+                }
+            }
+            Err(e) => {
+                result
+                    .metadata
+                    .additional
+                    .insert("chunking_error".to_string(), serde_json::Value::String(e.to_string()));
+            }
+        }
+    }
+
+    #[cfg(not(feature = "chunking"))]
+    if config.chunking.is_some() {
+        result.metadata.additional.insert(
+            "chunking_error".to_string(),
+            serde_json::Value::String("Chunking feature not enabled".to_string()),
+        );
+    }
+
+    // Language detection
+    #[cfg(feature = "language-detection")]
+    if let Some(ref lang_config) = config.language_detection {
+        match crate::language_detection::detect_languages(&result.content, lang_config) {
+            Ok(detected) => {
+                result.detected_languages = detected;
+            }
+            Err(e) => {
+                result.metadata.additional.insert(
+                    "language_detection_error".to_string(),
+                    serde_json::Value::String(e.to_string()),
+                );
+            }
+        }
+    }
+
+    #[cfg(not(feature = "language-detection"))]
+    if config.language_detection.is_some() {
+        result.metadata.additional.insert(
+            "language_detection_error".to_string(),
+            serde_json::Value::String("Language detection feature not enabled".to_string()),
+        );
+    }
+
+    Ok(result)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
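The `run_pipeline_sync` function added above is the synchronous twin of `run_pipeline` for WASM and other builds without the `tokio-runtime` feature. A hedged usage sketch (the wrapper function is hypothetical; only the `run_pipeline_sync` signature and its cfg gate come from the hunk):

    // Hypothetical caller in a build compiled without the `tokio-runtime` feature.
    #[cfg(not(feature = "tokio-runtime"))]
    fn postprocess(result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
        // Optional stages record failures under metadata.additional keys such as
        // "chunking_error" and "language_detection_error" rather than returning Err,
        // so only hard failures propagate out of this call.
        run_pipeline_sync(result, config)
    }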
@@ -235,6 +406,7 @@ mod tests {
     use lazy_static::lazy_static;
 
     const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
+    #[cfg(feature = "quality")]
     const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
     const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
     const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
@@ -253,6 +425,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         result.metadata.additional.insert(
             VALIDATION_MARKER_KEY.to_string(),
@@ -275,6 +448,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig {
             enable_quality_processing: true,
@@ -295,6 +469,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig {
             enable_quality_processing: false,
@@ -316,6 +491,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig {
             chunking: Some(crate::ChunkingConfig {
@@ -343,6 +519,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig {
             chunking: None,
@@ -367,6 +544,7 @@ mod tests {
                 additional,
                 ..Default::default()
             },
+            pages: None,
             tables: vec![],
             detected_languages: None,
             chunks: None,
@@ -403,6 +581,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig::default();
 
@@ -432,6 +611,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig::default();
 
@@ -452,6 +632,7 @@ mod tests {
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         let config = ExtractionConfig {
             enable_quality_processing: true,
@@ -488,6 +669,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
 
         #[cfg(feature = "keywords-yake")]
@@ -529,6 +711,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
 
         let config = ExtractionConfig {
@@ -564,6 +747,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
 
         #[cfg(feature = "keywords-yake")]
@@ -693,6 +877,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         result.metadata.additional.insert(
             VALIDATION_MARKER_KEY.to_string(),
@@ -778,6 +963,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
         result.metadata.additional.insert(
             VALIDATION_MARKER_KEY.to_string(),
@@ -969,6 +1155,7 @@ Natural language processing enables computers to understand human language.
             detected_languages: None,
             chunks: None,
             images: None,
+            pages: None,
         };
 
         let config = ExtractionConfig::default();

data/vendor/kreuzberg/src/extraction/docx.rs

@@ -2,8 +2,13 @@
 //!
 //! This module provides high-performance text extraction from DOCX files using the docx-lite
 //! library, which uses streaming XML parsing for efficiency.
+//!
+//! Page break detection is best-effort, detecting only explicit page breaks (`<w:br w:type="page"/>`)
+//! in the document XML. This does not account for automatic pagination based on content reflowing.
 
 use crate::error::{KreuzbergError, Result};
+use crate::types::PageBoundary;
+use std::io::Cursor;
 
 /// Extract text from DOCX bytes using docx-lite.
 ///
@@ -22,6 +27,163 @@ pub fn extract_text(bytes: &[u8]) -> Result<String> {
         .map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
 }
 
+/// Extract text and page boundaries from DOCX bytes.
+///
+/// Detects explicit page breaks (`<w:br w:type="page"/>`) in the document XML and maps them to
+/// character offsets in the extracted text. This is a best-effort approach that only detects
+/// explicit page breaks, not automatic pagination.
+///
+/// # Arguments
+/// * `bytes` - The DOCX file contents as bytes
+///
+/// # Returns
+/// * `Ok((String, Option<Vec<PageBoundary>>))` - Extracted text and optional page boundaries
+/// * `Err(KreuzbergError)` - If extraction fails
+///
+/// # Limitations
+/// - Only detects explicit page breaks, not reflowed content
+/// - Page numbers are estimates, not guaranteed accurate
+/// - Word's pagination may differ from detected breaks
+/// - No page dimensions available (would require layout engine)
+///
+/// # Performance
+/// Performs two passes: one with docx-lite for text extraction and one for page break detection.
+pub fn extract_text_with_page_breaks(bytes: &[u8]) -> Result<(String, Option<Vec<PageBoundary>>)> {
+    let text = extract_text(bytes)?;
+
+    let page_breaks = detect_page_breaks(bytes)?;
+
+    if page_breaks.is_empty() {
+        return Ok((text, None));
+    }
+
+    let boundaries = map_page_breaks_to_boundaries(&text, page_breaks)?;
+
+    Ok((text, Some(boundaries)))
+}
+
+/// Detect explicit page break positions in document.xml and extract full text with page boundaries.
+///
+/// This is a convenience function for the extractor that combines text extraction with page
+/// break detection. It returns the extracted text along with page boundaries.
+///
+/// # Arguments
+/// * `bytes` - The DOCX file contents (ZIP archive)
+///
+/// # Returns
+/// * `Ok(Option<Vec<PageBoundary>>)` - Optional page boundaries
+/// * `Err(KreuzbergError)` - If extraction fails
+///
+/// # Limitations
+/// - Only detects explicit page breaks, not reflowed content
+/// - Page numbers are estimates based on detected breaks
+pub fn detect_page_breaks_from_docx(bytes: &[u8]) -> Result<Option<Vec<PageBoundary>>> {
+    match extract_text_with_page_breaks(bytes) {
+        Ok((_, boundaries)) => Ok(boundaries),
+        Err(e) => {
+            tracing::debug!("Page break detection failed: {}", e);
+            Ok(None)
+        }
+    }
+}
+
+/// Detect explicit page break positions in document.xml.
+///
+/// Returns a vector of byte offsets within the document.xml content where page breaks occur.
+/// These offsets will later be mapped to character positions in the extracted text.
+///
+/// # Arguments
+/// * `bytes` - The DOCX file contents (ZIP archive)
+///
+/// # Returns
+/// * `Ok(Vec<usize>)` - Vector of detected page break byte offsets (empty if none found)
+/// * `Err(KreuzbergError)` - If ZIP/XML parsing fails
+fn detect_page_breaks(bytes: &[u8]) -> Result<Vec<usize>> {
+    use zip::ZipArchive;
+
+    let cursor = Cursor::new(bytes);
+    let mut archive =
+        ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to open DOCX as ZIP: {}", e)))?;
+
+    let document_xml = match archive.by_name("word/document.xml") {
+        Ok(mut file) => {
+            let mut content = String::new();
+            std::io::Read::read_to_string(&mut file, &mut content)
+                .map_err(|e| KreuzbergError::parsing(format!("Failed to read document.xml: {}", e)))?;
+            content
+        }
+        Err(_) => return Ok(Vec::new()),
+    };
+
+    let mut breaks = Vec::new();
+    let search_pattern = r#"<w:br w:type="page"/>"#;
+
+    for (idx, _) in document_xml.match_indices(search_pattern) {
+        breaks.push(idx);
+    }
+
+    Ok(breaks)
+}
+
+/// Map detected page break positions to byte boundaries in extracted text.
+///
+/// Since we don't have a precise mapping between document.xml byte positions and final text
+/// character positions, we use a heuristic: divide the text roughly equally between detected breaks.
+/// This is best-effort and may not perfectly match Word's pagination.
+///
+/// # LIMITATION
+/// This is a best-effort heuristic that distributes content evenly across detected page breaks.
+/// It does not account for actual page layout, varying page sizes, or Word's pagination logic.
+/// Use with caution. The function correctly handles multibyte UTF-8 characters (emoji, CJK, etc.)
+/// by working with character indices rather than byte indices.
+///
+/// # Arguments
+/// * `text` - The extracted document text
+/// * `page_breaks` - Vector of detected page break positions (unused, but kept for extension)
+///
+/// # Returns
+/// * `Ok(Vec<PageBoundary>)` - Byte boundaries for each page
+fn map_page_breaks_to_boundaries(text: &str, page_breaks: Vec<usize>) -> Result<Vec<PageBoundary>> {
+    if page_breaks.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let page_count = page_breaks.len() + 1;
+
+    let char_count = text.chars().count();
+    let chars_per_page = char_count / page_count;
+
+    let mut boundaries = Vec::new();
+    let mut byte_offset = 0;
+
+    for page_num in 1..=page_count {
+        let start = byte_offset;
+
+        let end = if page_num == page_count {
+            text.len()
+        } else {
+            let remaining = &text[byte_offset..];
+            let chars_to_skip = chars_per_page;
+            byte_offset
+                + remaining
+                    .chars()
+                    .take(chars_to_skip)
+                    .map(|c| c.len_utf8())
+                    .sum::<usize>()
+        };
+
+        byte_offset = end;
+
+        boundaries.push(PageBoundary {
+            byte_start: start,
+            byte_end: end,
+            page_number: page_num,
+        });
+    }
+
+    Ok(boundaries)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
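To make the even-split heuristic in `map_page_breaks_to_boundaries` concrete: N detected breaks yield N + 1 pages, each spanning `char_count / page_count` characters, with the final page absorbing the remainder. Worked through for a 31-character ASCII text (bytes equal chars here; the values follow directly from the code above):

    // 2 breaks -> 3 pages; 31 / 3 = 10 chars per page, remainder on the last page.
    let text = "0123456789012345678901234567890"; // 31 chars
    let pages = map_page_breaks_to_boundaries(text, vec![0, 0]).unwrap();
    assert_eq!(pages.len(), 3);
    assert_eq!((pages[0].byte_start, pages[0].byte_end), (0, 10));
    assert_eq!((pages[1].byte_start, pages[1].byte_end), (10, 20));
    assert_eq!((pages[2].byte_start, pages[2].byte_end), (20, 31));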
@@ -37,4 +199,200 @@ mod tests {
         let result = extract_text(b"not a docx file");
         assert!(result.is_err());
     }
+
+    #[test]
+    fn test_map_page_breaks_to_boundaries_empty() {
+        let result = map_page_breaks_to_boundaries("test text", Vec::new()).unwrap();
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_map_page_breaks_to_boundaries_single_break() {
+        let text = "Page 1 content here with some text.Page 2 content here with more text.";
+        let breaks = vec![0];
+
+        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].page_number, 1);
+        assert_eq!(result[0].byte_start, 0);
+        assert!(result[0].byte_end > 0);
+        assert!(result[0].byte_end < text.len());
+
+        assert_eq!(result[1].page_number, 2);
+        assert_eq!(result[1].byte_start, result[0].byte_end);
+        assert_eq!(result[1].byte_end, text.len());
+    }
+
+    #[test]
+    fn test_map_page_breaks_to_boundaries_multiple_breaks() {
+        let text = "A".repeat(300);
+        let breaks = vec![0, 0, 0];
+
+        let result = map_page_breaks_to_boundaries(&text, breaks).unwrap();
+
+        assert_eq!(result.len(), 4);
+        assert_eq!(result[0].page_number, 1);
+        assert_eq!(result[3].page_number, 4);
+        assert_eq!(result[3].byte_end, text.len());
+
+        for i in 0..result.len() - 1 {
+            assert_eq!(result[i].byte_end, result[i + 1].byte_start);
+        }
+    }
+
+    #[test]
+    fn test_map_page_breaks_to_boundaries_utf8_boundary() {
+        let text = "Hello world! こんにちは世界！ More text here.";
+        let breaks = vec![0];
+
+        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert!(text.is_char_boundary(result[0].byte_start));
+        assert!(text.is_char_boundary(result[0].byte_end));
+        assert!(text.is_char_boundary(result[1].byte_start));
+        assert!(text.is_char_boundary(result[1].byte_end));
+    }
+
+    #[test]
+    fn test_docx_page_breaks_with_emoji() {
+        let text = "Hello 😀 World 🌍 Foo 🎉 Bar";
+        let breaks = vec![0, 0];
+
+        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(result[0].page_number, 1);
+        assert_eq!(result[1].page_number, 2);
+        assert_eq!(result[2].page_number, 3);
+
+        for boundary in &result {
+            assert!(
+                text.is_char_boundary(boundary.byte_start),
+                "byte_start {} is not a valid UTF-8 boundary",
+                boundary.byte_start
+            );
+            assert!(
+                text.is_char_boundary(boundary.byte_end),
+                "byte_end {} is not a valid UTF-8 boundary",
+                boundary.byte_end
+            );
+        }
+
+        assert_eq!(result[0].byte_start, 0);
+        assert_eq!(result[0].byte_end, result[1].byte_start);
+        assert_eq!(result[1].byte_end, result[2].byte_start);
+        assert_eq!(result[2].byte_end, text.len());
+
+        let reconstructed = format!(
+            "{}{}{}",
+            &text[result[0].byte_start..result[0].byte_end],
+            &text[result[1].byte_start..result[1].byte_end],
+            &text[result[2].byte_start..result[2].byte_end]
+        );
+        assert_eq!(reconstructed, text);
+    }
+
+    #[test]
+    fn test_docx_page_breaks_with_cjk() {
+        let text = "你好世界你好世界你好世界你好世界";
+        let breaks = vec![0];
+
+        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
+
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].page_number, 1);
+        assert_eq!(result[1].page_number, 2);
+
+        for boundary in &result {
+            assert!(
+                text.is_char_boundary(boundary.byte_start),
+                "byte_start {} is not a valid UTF-8 boundary",
+                boundary.byte_start
+            );
+            assert!(
+                text.is_char_boundary(boundary.byte_end),
+                "byte_end {} is not a valid UTF-8 boundary",
+                boundary.byte_end
+            );
+        }
+
+        assert_eq!(result[0].byte_start, 0);
+        assert_eq!(result[0].byte_end, result[1].byte_start);
+        assert_eq!(result[1].byte_end, text.len());
+
+        let reconstructed = format!(
+            "{}{}",
+            &text[result[0].byte_start..result[0].byte_end],
+            &text[result[1].byte_start..result[1].byte_end]
+        );
+        assert_eq!(reconstructed, text);
+    }
+
+    #[test]
+    fn test_docx_page_breaks_multibyte_utf8() {
+        let text = "ASCII 😀 中文 hello 🎉 world 日本語";
+        let breaks = vec![0, 0];
+
+        let result = map_page_breaks_to_boundaries(text, breaks).unwrap();
+
+        assert_eq!(result.len(), 3);
+
+        for boundary in &result {
+            assert!(
+                text.is_char_boundary(boundary.byte_start),
+                "byte_start {} is not a valid UTF-8 boundary",
+                boundary.byte_start
+            );
+            assert!(
+                text.is_char_boundary(boundary.byte_end),
+                "byte_end {} is not a valid UTF-8 boundary",
+                boundary.byte_end
+            );
+        }
+
+        assert_eq!(result[0].byte_start, 0);
+        for i in 0..result.len() - 1 {
+            assert_eq!(
+                result[i].byte_end,
+                result[i + 1].byte_start,
+                "Gap or overlap between page {} and {}",
+                i + 1,
+                i + 2
+            );
+        }
+        assert_eq!(
+            result[result.len() - 1].byte_end,
+            text.len(),
+            "Last page does not end at text boundary"
+        );
+
+        let mut reconstructed = String::new();
+        for boundary in &result {
+            reconstructed.push_str(&text[boundary.byte_start..boundary.byte_end]);
+        }
+        assert_eq!(reconstructed, text);
+    }
+
+    #[test]
+    fn test_detect_page_breaks_no_feature() {
+        let result = detect_page_breaks(b"invalid");
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_extract_text_with_page_breaks_no_breaks() {
+        let docx_path =
+            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/documents/lorem_ipsum.docx");
+        if let Ok(bytes) = std::fs::read(docx_path) {
+            let result = extract_text_with_page_breaks(&bytes);
+            if let Ok((text, boundaries)) = result {
+                assert!(!text.is_empty());
+                if let Some(b) = boundaries {
+                    assert!(!b.is_empty());
+                }
+            }
+        }
+    }
 }
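Taken together, the DOCX changes and the pipeline changes connect: the boundaries detected here are the `PageBoundary` slice that the new three-argument `chunk_text` accepts. A minimal end-to-end sketch under stated assumptions (module visibility, re-export paths, and config values are illustrative, not confirmed by this diff):

    // Extract a DOCX with best-effort page boundaries, then chunk page-aware.
    let bytes = std::fs::read("report.docx").expect("read DOCX");
    let (text, boundaries) = extract_text_with_page_breaks(&bytes)?;
    let config = ChunkingConfig {
        max_characters: 2000,
        overlap: 200,
        trim: true,
        chunker_type: ChunkerType::Text,
    };
    // boundaries: Option<Vec<PageBoundary>>; chunk_text takes Option<&[PageBoundary]>.
    let chunks = chunk_text(&text, &config, boundaries.as_deref())?;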