kreuzberg 4.2.11 → 4.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +24 -9
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
- data/vendor/kreuzberg/src/core/mime.rs +47 -2
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
- data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
- data/vendor/kreuzberg/src/extraction/image.rs +405 -18
- data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
- data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
- data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
- data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
- data/vendor/kreuzberg/src/extractors/image.rs +25 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
- data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
- data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
- data/vendor/kreuzberg/src/extractors/security.rs +2 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
- data/vendor/kreuzberg/src/extractors/text.rs +33 -4
- data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
- data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +7 -3
|
@@ -1,31 +1,21 @@
|
|
|
1
|
-
//! DOCX (Microsoft Word) text extraction
|
|
1
|
+
//! DOCX (Microsoft Word) text extraction.
|
|
2
2
|
//!
|
|
3
|
-
//! This module provides high-performance text extraction from DOCX files using
|
|
4
|
-
//!
|
|
3
|
+
//! This module provides high-performance text extraction from DOCX files using
|
|
4
|
+
//! streaming XML parsing for efficiency.
|
|
5
5
|
//!
|
|
6
6
|
//! Page break detection is best-effort, detecting only explicit page breaks (`<w:br w:type="page"/>`)
|
|
7
7
|
//! in the document XML. This does not account for automatic pagination based on content reflowing.
|
|
8
8
|
|
|
9
|
+
pub mod parser;
|
|
10
|
+
|
|
9
11
|
use crate::error::{KreuzbergError, Result};
|
|
10
12
|
use crate::extraction::capacity;
|
|
11
13
|
use crate::types::PageBoundary;
|
|
12
14
|
use std::io::Cursor;
|
|
13
15
|
|
|
14
|
-
/// Extract text from DOCX bytes
|
|
15
|
-
///
|
|
16
|
-
/// # Arguments
|
|
17
|
-
/// * `bytes` - The DOCX file contents as bytes
|
|
18
|
-
///
|
|
19
|
-
/// # Returns
|
|
20
|
-
/// * `Ok(String)` - The extracted text content
|
|
21
|
-
/// * `Err(KreuzbergError)` - If extraction fails
|
|
22
|
-
///
|
|
23
|
-
/// # Performance
|
|
24
|
-
/// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
|
|
25
|
-
/// (~160 MB/s average).
|
|
16
|
+
/// Extract text from DOCX bytes.
|
|
26
17
|
pub fn extract_text(bytes: &[u8]) -> Result<String> {
|
|
27
|
-
|
|
28
|
-
.map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
|
|
18
|
+
parser::extract_text_from_bytes(bytes)
|
|
29
19
|
}
|
|
30
20
|
|
|
31
21
|
/// Extract text and page boundaries from DOCX bytes.
|