kreuzberg 4.2.11 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -9
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
  19. data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
  20. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  21. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  22. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  23. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  24. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  25. data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
  26. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  27. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  28. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  29. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  30. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  31. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  32. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  33. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  34. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  35. data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
  36. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  37. metadata +7 -3
@@ -1,31 +1,21 @@
1
- //! DOCX (Microsoft Word) text extraction using docx-lite.
1
+ //! DOCX (Microsoft Word) text extraction.
2
2
  //!
3
- //! This module provides high-performance text extraction from DOCX files using the docx-lite
4
- //! library, which uses streaming XML parsing for efficiency.
3
+ //! This module provides high-performance text extraction from DOCX files using
4
+ //! streaming XML parsing for efficiency.
5
5
  //!
6
6
  //! Page break detection is best-effort, detecting only explicit page breaks (`<w:br w:type="page"/>`)
7
7
  //! in the document XML. This does not account for automatic pagination based on content reflowing.
8
8
 
9
+ pub mod parser;
10
+
9
11
  use crate::error::{KreuzbergError, Result};
10
12
  use crate::extraction::capacity;
11
13
  use crate::types::PageBoundary;
12
14
  use std::io::Cursor;
13
15
 
14
- /// Extract text from DOCX bytes using docx-lite.
15
- ///
16
- /// # Arguments
17
- /// * `bytes` - The DOCX file contents as bytes
18
- ///
19
- /// # Returns
20
- /// * `Ok(String)` - The extracted text content
21
- /// * `Err(KreuzbergError)` - If extraction fails
22
- ///
23
- /// # Performance
24
- /// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
25
- /// (~160 MB/s average).
16
+ /// Extract text from DOCX bytes.
26
17
  pub fn extract_text(bytes: &[u8]) -> Result<String> {
27
- docx_lite::extract_text_from_bytes(bytes)
28
- .map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
18
+ parser::extract_text_from_bytes(bytes)
29
19
  }
30
20
 
31
21
  /// Extract text and page boundaries from DOCX bytes.