kreuzberg 4.2.10 → 4.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +6 -8
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/extractor/batch.rs +29 -8
- data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
- data/vendor/kreuzberg/src/types/metadata.rs +7 -0
- data/vendor/kreuzberg/tests/api_embed.rs +5 -0
- data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 869101ec7a3d0814c2baed8606879024e94880ead0003d441a16199d25fd3a16
|
|
4
|
+
data.tar.gz: 9852c4c51345f362095306f40d910c2e1d4ae19f385754d4c8d3a960123f96ee
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 579be7645c2f406ce8e7c4cc85ed511edb7c5879bc8674fe9ec4eb4375cf240113968f0ce35747bd10fae52ff5849df4616d8c6db299db609696da24b2700fff
|
|
7
|
+
data.tar.gz: adc5969c0480739fd57a6bdb832db435d64e16ee0b2827197f05fcf4f52a7e7c03affd704b4c1064df49391bb105c4166531b8c4ba96538906b38f863744c843
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.10)
|
|
4
|
+
kreuzberg (4.2.12)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -123,7 +123,7 @@ GEM
|
|
|
123
123
|
rubocop (~> 1.81)
|
|
124
124
|
ruby-progressbar (1.13.0)
|
|
125
125
|
securerandom (0.4.1)
|
|
126
|
-
sorbet-runtime (0.6.
|
|
126
|
+
sorbet-runtime (0.6.12925)
|
|
127
127
|
steep (1.10.0)
|
|
128
128
|
activesupport (>= 5.1)
|
|
129
129
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -209,7 +209,7 @@ CHECKSUMS
|
|
|
209
209
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
210
210
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
211
211
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
212
|
-
kreuzberg (4.2.10)
|
|
212
|
+
kreuzberg (4.2.12)
|
|
213
213
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
214
214
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
215
215
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -244,7 +244,7 @@ CHECKSUMS
|
|
|
244
244
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
245
245
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
246
246
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
247
|
-
sorbet-runtime (0.6.
|
|
247
|
+
sorbet-runtime (0.6.12925) sha256=ddd6fb1d8aaf6bc19119ffadbc4b96536f3d6766fa82059112dacb90977c6eca
|
|
248
248
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
249
249
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
250
250
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.10" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.12" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.10"
|
|
3
|
+
version = "4.2.12"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -35,7 +35,6 @@ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
|
|
|
35
35
|
office = [
|
|
36
36
|
"dep:roxmltree",
|
|
37
37
|
"dep:zip",
|
|
38
|
-
"dep:docx-lite",
|
|
39
38
|
"dep:quick-xml",
|
|
40
39
|
"dep:pulldown-cmark",
|
|
41
40
|
"dep:biblatex",
|
|
@@ -151,10 +150,10 @@ pdfium-render = { package = "kreuzberg-pdfium-render", version = "0.9.0", featur
|
|
|
151
150
|
"image_latest",
|
|
152
151
|
], optional = true }
|
|
153
152
|
lopdf = { version = "0.39.0", optional = true }
|
|
154
|
-
calamine = { version = "0.
|
|
153
|
+
calamine = { version = "0.33.0", features = ["dates"], optional = true }
|
|
155
154
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
156
155
|
roxmltree = { version = "0.21.1", optional = true }
|
|
157
|
-
zip = { version = "7.
|
|
156
|
+
zip = { version = "7.4.0", optional = true }
|
|
158
157
|
mail-parser = { version = "0.11.1", optional = true }
|
|
159
158
|
msg_parser = { version = "0.1.1", optional = true }
|
|
160
159
|
html-to-markdown-rs = { workspace = true, features = [
|
|
@@ -165,7 +164,6 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
|
|
|
165
164
|
tar = { version = "0.4.44", optional = true }
|
|
166
165
|
sevenz-rust2 = { version = "0.20.1", optional = true }
|
|
167
166
|
lzma-rust2 = { workspace = true, optional = true }
|
|
168
|
-
docx-lite = { version = "0.2.0", optional = true }
|
|
169
167
|
|
|
170
168
|
pulldown-cmark = { version = "0.13", optional = true }
|
|
171
169
|
biblatex = { version = "0.11", optional = true }
|
|
@@ -218,7 +216,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
|
|
|
218
216
|
tempfile = { workspace = true }
|
|
219
217
|
filetime = "0.2"
|
|
220
218
|
tar = "0.4.44"
|
|
221
|
-
zip = "7.
|
|
219
|
+
zip = "7.4.0"
|
|
222
220
|
serial_test = "3.3.1"
|
|
223
221
|
anyhow = { workspace = true }
|
|
224
222
|
tokio-test = "0.4"
|
|
@@ -239,7 +237,7 @@ fastembed = { version = "5.8", default-features = false, features = [
|
|
|
239
237
|
"ort-load-dynamic",
|
|
240
238
|
], optional = true }
|
|
241
239
|
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
|
|
242
|
-
ureq = { version = "3.
|
|
240
|
+
ureq = { version = "3.2", default-features = false, features = ["rustls", "json"] }
|
|
243
241
|
|
|
244
242
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
245
243
|
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
|
|
@@ -253,7 +251,7 @@ fastembed = { version = "5.8", default-features = false, features = [
|
|
|
253
251
|
"ort-load-dynamic",
|
|
254
252
|
], optional = true }
|
|
255
253
|
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
|
|
256
|
-
ureq = { version = "3.
|
|
254
|
+
ureq = { version = "3.2", default-features = false, features = ["native-tls", "json"] }
|
|
257
255
|
|
|
258
256
|
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
|
259
257
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.10 Release**
|
|
20
|
+
> **🚀 Version 4.2.12 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -9,6 +9,7 @@ use crate::{KreuzbergError, Result};
|
|
|
9
9
|
use std::borrow::Cow;
|
|
10
10
|
use std::path::Path;
|
|
11
11
|
use std::sync::Arc;
|
|
12
|
+
use std::time::Instant;
|
|
12
13
|
|
|
13
14
|
use super::bytes::extract_bytes;
|
|
14
15
|
use super::file::extract_file;
|
|
@@ -82,10 +83,18 @@ pub async fn batch_extract_file(
|
|
|
82
83
|
|
|
83
84
|
tasks.spawn(async move {
|
|
84
85
|
let _permit = semaphore_clone.acquire().await.unwrap();
|
|
85
|
-
let
|
|
86
|
+
let start = Instant::now();
|
|
87
|
+
let mut result =
|
|
86
88
|
crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
|
|
87
89
|
.await;
|
|
88
|
-
(
|
|
90
|
+
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
91
|
+
|
|
92
|
+
// Add extraction timing to result metadata for benchmarking
|
|
93
|
+
if let Ok(ref mut r) = result {
|
|
94
|
+
r.metadata.extraction_duration_ms = Some(elapsed_ms);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
(index, result, elapsed_ms)
|
|
89
98
|
});
|
|
90
99
|
}
|
|
91
100
|
|
|
@@ -93,10 +102,11 @@ pub async fn batch_extract_file(
|
|
|
93
102
|
|
|
94
103
|
while let Some(task_result) = tasks.join_next().await {
|
|
95
104
|
match task_result {
|
|
96
|
-
Ok((index, Ok(result))) => {
|
|
105
|
+
Ok((index, Ok(result), _elapsed_ms)) => {
|
|
106
|
+
// Timing already added to result.metadata.extraction_duration_ms
|
|
97
107
|
results[index] = Some(result);
|
|
98
108
|
}
|
|
99
|
-
Ok((index, Err(e))) => {
|
|
109
|
+
Ok((index, Err(e), elapsed_ms)) => {
|
|
100
110
|
// All errors (including Io) should create error results
|
|
101
111
|
// instead of causing early return that abandons running tasks
|
|
102
112
|
let metadata = Metadata {
|
|
@@ -104,6 +114,7 @@ pub async fn batch_extract_file(
|
|
|
104
114
|
error_type: format!("{:?}", e),
|
|
105
115
|
message: e.to_string(),
|
|
106
116
|
}),
|
|
117
|
+
extraction_duration_ms: Some(elapsed_ms),
|
|
107
118
|
..Default::default()
|
|
108
119
|
};
|
|
109
120
|
|
|
@@ -196,11 +207,19 @@ pub async fn batch_extract_bytes(
|
|
|
196
207
|
|
|
197
208
|
tasks.spawn(async move {
|
|
198
209
|
let _permit = semaphore_clone.acquire().await.unwrap();
|
|
199
|
-
let
|
|
210
|
+
let start = Instant::now();
|
|
211
|
+
let mut result = crate::core::batch_mode::with_batch_mode(async {
|
|
200
212
|
extract_bytes(&bytes, &mime_type, &config_clone).await
|
|
201
213
|
})
|
|
202
214
|
.await;
|
|
203
|
-
(
|
|
215
|
+
let elapsed_ms = start.elapsed().as_millis() as u64;
|
|
216
|
+
|
|
217
|
+
// Add extraction timing to result metadata for benchmarking
|
|
218
|
+
if let Ok(ref mut r) = result {
|
|
219
|
+
r.metadata.extraction_duration_ms = Some(elapsed_ms);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
(index, result, elapsed_ms)
|
|
204
223
|
});
|
|
205
224
|
}
|
|
206
225
|
|
|
@@ -208,10 +227,11 @@ pub async fn batch_extract_bytes(
|
|
|
208
227
|
|
|
209
228
|
while let Some(task_result) = tasks.join_next().await {
|
|
210
229
|
match task_result {
|
|
211
|
-
Ok((index, Ok(result))) => {
|
|
230
|
+
Ok((index, Ok(result), _elapsed_ms)) => {
|
|
231
|
+
// Timing already added to result.metadata.extraction_duration_ms
|
|
212
232
|
results[index] = Some(result);
|
|
213
233
|
}
|
|
214
|
-
Ok((index, Err(e))) => {
|
|
234
|
+
Ok((index, Err(e), elapsed_ms)) => {
|
|
215
235
|
// All errors (including Io) should create error results
|
|
216
236
|
// instead of causing early return that abandons running tasks
|
|
217
237
|
let metadata = Metadata {
|
|
@@ -219,6 +239,7 @@ pub async fn batch_extract_bytes(
|
|
|
219
239
|
error_type: format!("{:?}", e),
|
|
220
240
|
message: e.to_string(),
|
|
221
241
|
}),
|
|
242
|
+
extraction_duration_ms: Some(elapsed_ms),
|
|
222
243
|
..Default::default()
|
|
223
244
|
};
|
|
224
245
|
|
|
@@ -1,31 +1,21 @@
|
|
|
1
|
-
//! DOCX (Microsoft Word) text extraction
|
|
1
|
+
//! DOCX (Microsoft Word) text extraction.
|
|
2
2
|
//!
|
|
3
|
-
//! This module provides high-performance text extraction from DOCX files using
|
|
4
|
-
//!
|
|
3
|
+
//! This module provides high-performance text extraction from DOCX files using
|
|
4
|
+
//! streaming XML parsing for efficiency.
|
|
5
5
|
//!
|
|
6
6
|
//! Page break detection is best-effort, detecting only explicit page breaks (`<w:br w:type="page"/>`)
|
|
7
7
|
//! in the document XML. This does not account for automatic pagination based on content reflowing.
|
|
8
8
|
|
|
9
|
+
pub mod parser;
|
|
10
|
+
|
|
9
11
|
use crate::error::{KreuzbergError, Result};
|
|
10
12
|
use crate::extraction::capacity;
|
|
11
13
|
use crate::types::PageBoundary;
|
|
12
14
|
use std::io::Cursor;
|
|
13
15
|
|
|
14
|
-
/// Extract text from DOCX bytes
|
|
15
|
-
///
|
|
16
|
-
/// # Arguments
|
|
17
|
-
/// * `bytes` - The DOCX file contents as bytes
|
|
18
|
-
///
|
|
19
|
-
/// # Returns
|
|
20
|
-
/// * `Ok(String)` - The extracted text content
|
|
21
|
-
/// * `Err(KreuzbergError)` - If extraction fails
|
|
22
|
-
///
|
|
23
|
-
/// # Performance
|
|
24
|
-
/// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
|
|
25
|
-
/// (~160 MB/s average).
|
|
16
|
+
/// Extract text from DOCX bytes.
|
|
26
17
|
pub fn extract_text(bytes: &[u8]) -> Result<String> {
|
|
27
|
-
|
|
28
|
-
.map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
|
|
18
|
+
parser::extract_text_from_bytes(bytes)
|
|
29
19
|
}
|
|
30
20
|
|
|
31
21
|
/// Extract text and page boundaries from DOCX bytes.
|
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
//! Inline DOCX XML parser.
|
|
2
|
+
//!
|
|
3
|
+
//! Vendored and adapted from [docx-lite](https://github.com/v-lawyer/docx-lite) v0.2.0
|
|
4
|
+
//! (MIT OR Apache-2.0, V-Lawyer Team). See ATTRIBUTIONS.md for details.
|
|
5
|
+
//!
|
|
6
|
+
//! Changes from upstream:
|
|
7
|
+
//! - The parser uses `trim_text(false)` so whitespace inside `<w:t>` is preserved, and `Paragraph::to_text()` joins runs directly without an added separator (fixes #359)
|
|
8
|
+
//! - Adapted to use kreuzberg's existing `quick-xml` and `zip` versions
|
|
9
|
+
//! - Removed file-path based APIs (we only need bytes/reader)
|
|
10
|
+
|
|
11
|
+
use std::collections::HashMap;
|
|
12
|
+
use std::io::{Cursor, Read, Seek};
|
|
13
|
+
|
|
14
|
+
use quick_xml::Reader;
|
|
15
|
+
use quick_xml::events::Event;
|
|
16
|
+
|
|
17
|
+
// --- Types ---
|
|
18
|
+
|
|
19
|
+
#[derive(Debug, Clone, Default)]
|
|
20
|
+
pub struct Document {
|
|
21
|
+
pub paragraphs: Vec<Paragraph>,
|
|
22
|
+
pub tables: Vec<Table>,
|
|
23
|
+
pub lists: Vec<ListItem>,
|
|
24
|
+
pub headers: Vec<HeaderFooter>,
|
|
25
|
+
pub footers: Vec<HeaderFooter>,
|
|
26
|
+
pub footnotes: Vec<Note>,
|
|
27
|
+
pub endnotes: Vec<Note>,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
#[derive(Debug, Clone, Default)]
|
|
31
|
+
pub struct Paragraph {
|
|
32
|
+
pub runs: Vec<Run>,
|
|
33
|
+
pub style: Option<String>,
|
|
34
|
+
pub numbering_id: Option<i64>,
|
|
35
|
+
pub numbering_level: Option<i64>,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
#[derive(Debug, Clone, Default)]
|
|
39
|
+
pub struct Run {
|
|
40
|
+
pub text: String,
|
|
41
|
+
pub bold: bool,
|
|
42
|
+
pub italic: bool,
|
|
43
|
+
pub underline: bool,
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
#[derive(Debug, Clone, Default)]
|
|
47
|
+
pub struct Table {
|
|
48
|
+
pub rows: Vec<TableRow>,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
#[derive(Debug, Clone, Default)]
|
|
52
|
+
pub struct TableRow {
|
|
53
|
+
pub cells: Vec<TableCell>,
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
#[derive(Debug, Clone, Default)]
|
|
57
|
+
pub struct TableCell {
|
|
58
|
+
pub paragraphs: Vec<Paragraph>,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
#[derive(Debug, Clone)]
|
|
62
|
+
pub struct ListItem {
|
|
63
|
+
pub level: u32,
|
|
64
|
+
pub list_type: ListType,
|
|
65
|
+
pub number: Option<String>,
|
|
66
|
+
pub text: String,
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
#[derive(Debug, Clone, PartialEq)]
|
|
70
|
+
pub enum ListType {
|
|
71
|
+
Bullet,
|
|
72
|
+
Numbered,
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
#[derive(Debug, Clone, Default)]
|
|
76
|
+
pub struct HeaderFooter {
|
|
77
|
+
pub paragraphs: Vec<Paragraph>,
|
|
78
|
+
pub tables: Vec<Table>,
|
|
79
|
+
pub header_type: HeaderFooterType,
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
#[derive(Debug, Clone, Default, PartialEq)]
|
|
83
|
+
pub enum HeaderFooterType {
|
|
84
|
+
#[default]
|
|
85
|
+
Default,
|
|
86
|
+
First,
|
|
87
|
+
Even,
|
|
88
|
+
Odd,
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
#[derive(Debug, Clone)]
|
|
92
|
+
pub struct Note {
|
|
93
|
+
pub id: String,
|
|
94
|
+
pub note_type: NoteType,
|
|
95
|
+
pub paragraphs: Vec<Paragraph>,
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
#[derive(Debug, Clone, PartialEq)]
|
|
99
|
+
pub enum NoteType {
|
|
100
|
+
Footnote,
|
|
101
|
+
Endnote,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// --- Impls ---
|
|
105
|
+
|
|
106
|
+
impl Document {
|
|
107
|
+
pub fn new() -> Self {
|
|
108
|
+
Self::default()
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
pub fn extract_text(&self) -> String {
|
|
112
|
+
let mut text = String::new();
|
|
113
|
+
|
|
114
|
+
let mut list_index = 0;
|
|
115
|
+
for paragraph in &self.paragraphs {
|
|
116
|
+
if let (Some(_num_id), Some(_level)) = (paragraph.numbering_id, paragraph.numbering_level) {
|
|
117
|
+
let para_text = paragraph.to_text();
|
|
118
|
+
if !para_text.is_empty() {
|
|
119
|
+
text.push_str(&para_text);
|
|
120
|
+
text.push('\n');
|
|
121
|
+
}
|
|
122
|
+
list_index += 1;
|
|
123
|
+
let _ = list_index; // suppress unused warning
|
|
124
|
+
} else {
|
|
125
|
+
let para_text = paragraph.to_text();
|
|
126
|
+
if !para_text.is_empty() {
|
|
127
|
+
text.push_str(&para_text);
|
|
128
|
+
text.push('\n');
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
for table in &self.tables {
|
|
134
|
+
for row in &table.rows {
|
|
135
|
+
for cell in &row.cells {
|
|
136
|
+
for paragraph in &cell.paragraphs {
|
|
137
|
+
let para_text = paragraph.to_text();
|
|
138
|
+
if !para_text.is_empty() {
|
|
139
|
+
text.push_str(&para_text);
|
|
140
|
+
text.push('\t');
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
text.push('\n');
|
|
145
|
+
}
|
|
146
|
+
text.push('\n');
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
text
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
impl Paragraph {
|
|
154
|
+
pub fn new() -> Self {
|
|
155
|
+
Self::default()
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/// Concatenate text runs to produce paragraph text.
|
|
159
|
+
///
|
|
160
|
+
/// In DOCX, whitespace between words is stored inside `<w:t>` elements
|
|
161
|
+
/// (e.g. `<w:t>Hello </w:t><w:t>World</w:t>`), so runs are joined
|
|
162
|
+
/// directly without adding extra separators. The parser must use
|
|
163
|
+
/// `trim_text(false)` to preserve this whitespace.
|
|
164
|
+
pub fn to_text(&self) -> String {
|
|
165
|
+
let mut text = String::new();
|
|
166
|
+
for run in &self.runs {
|
|
167
|
+
text.push_str(&run.text);
|
|
168
|
+
}
|
|
169
|
+
text
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
pub fn add_run(&mut self, run: Run) {
|
|
173
|
+
self.runs.push(run);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
impl Run {
|
|
178
|
+
pub fn new(text: String) -> Self {
|
|
179
|
+
Self {
|
|
180
|
+
text,
|
|
181
|
+
..Default::default()
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
impl Table {
|
|
187
|
+
pub fn new() -> Self {
|
|
188
|
+
Self::default()
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
impl HeaderFooter {
|
|
193
|
+
pub fn extract_text(&self) -> String {
|
|
194
|
+
let mut text = String::new();
|
|
195
|
+
|
|
196
|
+
for paragraph in &self.paragraphs {
|
|
197
|
+
let para_text = paragraph.to_text();
|
|
198
|
+
if !para_text.is_empty() {
|
|
199
|
+
text.push_str(&para_text);
|
|
200
|
+
text.push('\n');
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
for table in &self.tables {
|
|
205
|
+
for row in &table.rows {
|
|
206
|
+
for cell in &row.cells {
|
|
207
|
+
for paragraph in &cell.paragraphs {
|
|
208
|
+
let para_text = paragraph.to_text();
|
|
209
|
+
if !para_text.is_empty() {
|
|
210
|
+
text.push_str(&para_text);
|
|
211
|
+
text.push('\t');
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
text.push('\n');
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
text
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// --- Parser ---
|
|
224
|
+
|
|
225
|
+
struct DocxParser<R: Read + Seek> {
|
|
226
|
+
archive: zip::ZipArchive<R>,
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
impl<R: Read + Seek> DocxParser<R> {
|
|
230
|
+
fn new(reader: R) -> Result<Self, DocxParseError> {
|
|
231
|
+
let archive = zip::ZipArchive::new(reader)?;
|
|
232
|
+
Ok(Self { archive })
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
fn parse(mut self) -> Result<Document, DocxParseError> {
|
|
236
|
+
let mut document = Document::new();
|
|
237
|
+
|
|
238
|
+
let document_xml = self.read_file("word/document.xml")?;
|
|
239
|
+
self.parse_document_xml(&document_xml, &mut document)?;
|
|
240
|
+
|
|
241
|
+
if let Ok(numbering_xml) = self.read_file("word/numbering.xml") {
|
|
242
|
+
let numbering_defs = self.parse_numbering(&numbering_xml)?;
|
|
243
|
+
self.process_lists(&mut document, &numbering_defs);
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
self.parse_headers_footers(&mut document)?;
|
|
247
|
+
|
|
248
|
+
if let Ok(footnotes_xml) = self.read_file("word/footnotes.xml") {
|
|
249
|
+
self.parse_notes(&footnotes_xml, &mut document.footnotes, NoteType::Footnote)?;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if let Ok(endnotes_xml) = self.read_file("word/endnotes.xml") {
|
|
253
|
+
self.parse_notes(&endnotes_xml, &mut document.endnotes, NoteType::Endnote)?;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
Ok(document)
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
fn read_file(&mut self, path: &str) -> Result<String, DocxParseError> {
|
|
260
|
+
let mut file = self
|
|
261
|
+
.archive
|
|
262
|
+
.by_name(path)
|
|
263
|
+
.map_err(|_| DocxParseError::FileNotFound(path.to_string()))?;
|
|
264
|
+
|
|
265
|
+
let mut contents = String::new();
|
|
266
|
+
file.read_to_string(&mut contents)?;
|
|
267
|
+
Ok(contents)
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<(), DocxParseError> {
|
|
271
|
+
let mut reader = Reader::from_str(xml);
|
|
272
|
+
reader.config_mut().trim_text(false);
|
|
273
|
+
|
|
274
|
+
let mut buf = Vec::new();
|
|
275
|
+
let mut current_paragraph: Option<Paragraph> = None;
|
|
276
|
+
let mut current_run: Option<Run> = None;
|
|
277
|
+
let mut current_table: Option<Table> = None;
|
|
278
|
+
let mut current_row: Option<TableRow> = None;
|
|
279
|
+
let mut current_cell: Option<TableCell> = None;
|
|
280
|
+
let mut in_text = false;
|
|
281
|
+
let mut in_table = false;
|
|
282
|
+
|
|
283
|
+
loop {
|
|
284
|
+
match reader.read_event_into(&mut buf) {
|
|
285
|
+
Ok(Event::Start(ref e)) => match e.name().as_ref() {
|
|
286
|
+
b"w:p" => {
|
|
287
|
+
if in_table {
|
|
288
|
+
if current_cell.is_none() {
|
|
289
|
+
current_cell = Some(TableCell::default());
|
|
290
|
+
}
|
|
291
|
+
} else {
|
|
292
|
+
current_paragraph = Some(Paragraph::new());
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
b"w:numPr" => {
|
|
296
|
+
if let Some(ref mut para) = current_paragraph {
|
|
297
|
+
para.numbering_id = Some(1);
|
|
298
|
+
para.numbering_level = Some(0);
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
b"w:r" => {
|
|
302
|
+
current_run = Some(Run::default());
|
|
303
|
+
}
|
|
304
|
+
b"w:t" => {
|
|
305
|
+
in_text = true;
|
|
306
|
+
}
|
|
307
|
+
b"w:tbl" => {
|
|
308
|
+
in_table = true;
|
|
309
|
+
current_table = Some(Table::new());
|
|
310
|
+
}
|
|
311
|
+
b"w:tr" => {
|
|
312
|
+
current_row = Some(TableRow::default());
|
|
313
|
+
}
|
|
314
|
+
b"w:tc" => {
|
|
315
|
+
current_cell = Some(TableCell::default());
|
|
316
|
+
}
|
|
317
|
+
b"w:b" => {
|
|
318
|
+
if let Some(ref mut run) = current_run {
|
|
319
|
+
run.bold = true;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
b"w:i" => {
|
|
323
|
+
if let Some(ref mut run) = current_run {
|
|
324
|
+
run.italic = true;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
b"w:u" => {
|
|
328
|
+
if let Some(ref mut run) = current_run {
|
|
329
|
+
run.underline = true;
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
_ => {}
|
|
333
|
+
},
|
|
334
|
+
Ok(Event::Text(e)) => {
|
|
335
|
+
if in_text {
|
|
336
|
+
if let Some(ref mut run) = current_run {
|
|
337
|
+
let text = e.decode()?.into_owned();
|
|
338
|
+
run.text.push_str(&text);
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
Ok(Event::End(ref e)) => match e.name().as_ref() {
|
|
343
|
+
b"w:t" => {
|
|
344
|
+
in_text = false;
|
|
345
|
+
}
|
|
346
|
+
b"w:r" => {
|
|
347
|
+
if let Some(run) = current_run.take() {
|
|
348
|
+
if in_table {
|
|
349
|
+
if let Some(ref mut cell) = current_cell {
|
|
350
|
+
if cell.paragraphs.is_empty() {
|
|
351
|
+
cell.paragraphs.push(Paragraph::new());
|
|
352
|
+
}
|
|
353
|
+
if let Some(para) = cell.paragraphs.last_mut() {
|
|
354
|
+
para.add_run(run);
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
} else if let Some(ref mut para) = current_paragraph {
|
|
358
|
+
para.add_run(run);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
b"w:p" => {
|
|
363
|
+
if in_table {
|
|
364
|
+
// handled via cell
|
|
365
|
+
} else if let Some(para) = current_paragraph.take() {
|
|
366
|
+
document.paragraphs.push(para);
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
b"w:tc" => {
|
|
370
|
+
if let Some(cell) = current_cell.take() {
|
|
371
|
+
if let Some(ref mut row) = current_row {
|
|
372
|
+
row.cells.push(cell);
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
b"w:tr" => {
|
|
377
|
+
if let Some(row) = current_row.take() {
|
|
378
|
+
if let Some(ref mut table) = current_table {
|
|
379
|
+
table.rows.push(row);
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
b"w:tbl" => {
|
|
384
|
+
in_table = false;
|
|
385
|
+
if let Some(table) = current_table.take() {
|
|
386
|
+
document.tables.push(table);
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
_ => {}
|
|
390
|
+
},
|
|
391
|
+
Ok(Event::Eof) => break,
|
|
392
|
+
Err(e) => return Err(e.into()),
|
|
393
|
+
_ => {}
|
|
394
|
+
}
|
|
395
|
+
buf.clear();
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
Ok(())
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
fn parse_numbering(&self, xml: &str) -> Result<HashMap<i64, ListType>, DocxParseError> {
|
|
402
|
+
let mut numbering_defs = HashMap::new();
|
|
403
|
+
let mut reader = Reader::from_str(xml);
|
|
404
|
+
reader.config_mut().trim_text(false);
|
|
405
|
+
|
|
406
|
+
let mut buf = Vec::new();
|
|
407
|
+
let mut current_num_id: Option<i64> = None;
|
|
408
|
+
|
|
409
|
+
loop {
|
|
410
|
+
match reader.read_event_into(&mut buf) {
|
|
411
|
+
Ok(Event::Start(ref e)) => {
|
|
412
|
+
if e.name().as_ref() == b"w:num" {
|
|
413
|
+
for attr in e.attributes().flatten() {
|
|
414
|
+
if attr.key.as_ref() == b"w:numId" {
|
|
415
|
+
if let Ok(id_str) = std::str::from_utf8(&attr.value) {
|
|
416
|
+
current_num_id = id_str.parse().ok();
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
Ok(Event::End(ref e)) => {
|
|
423
|
+
if e.name().as_ref() == b"w:num" {
|
|
424
|
+
if let Some(id) = current_num_id {
|
|
425
|
+
numbering_defs.insert(id, ListType::Bullet);
|
|
426
|
+
}
|
|
427
|
+
current_num_id = None;
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
Ok(Event::Eof) => break,
|
|
431
|
+
_ => {}
|
|
432
|
+
}
|
|
433
|
+
buf.clear();
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
Ok(numbering_defs)
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
fn process_lists(&self, document: &mut Document, numbering_defs: &HashMap<i64, ListType>) {
|
|
440
|
+
for paragraph in &document.paragraphs {
|
|
441
|
+
if let (Some(num_id), Some(level)) = (paragraph.numbering_id, paragraph.numbering_level) {
|
|
442
|
+
let list_type = numbering_defs.get(&num_id).cloned().unwrap_or(ListType::Bullet);
|
|
443
|
+
|
|
444
|
+
let list_item = ListItem {
|
|
445
|
+
level: level as u32,
|
|
446
|
+
list_type,
|
|
447
|
+
number: None,
|
|
448
|
+
text: paragraph.to_text(),
|
|
449
|
+
};
|
|
450
|
+
|
|
451
|
+
document.lists.push(list_item);
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
fn parse_headers_footers(&mut self, document: &mut Document) -> Result<(), DocxParseError> {
|
|
457
|
+
for i in 1..=3 {
|
|
458
|
+
let header_path = format!("word/header{}.xml", i);
|
|
459
|
+
if let Ok(header_xml) = self.read_file(&header_path) {
|
|
460
|
+
let mut header = HeaderFooter::default();
|
|
461
|
+
self.parse_header_footer_content(&header_xml, &mut header)?;
|
|
462
|
+
document.headers.push(header);
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
let footer_path = format!("word/footer{}.xml", i);
|
|
466
|
+
if let Ok(footer_xml) = self.read_file(&footer_path) {
|
|
467
|
+
let mut footer = HeaderFooter::default();
|
|
468
|
+
self.parse_header_footer_content(&footer_xml, &mut footer)?;
|
|
469
|
+
document.footers.push(footer);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
Ok(())
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
fn parse_header_footer_content(&self, xml: &str, header_footer: &mut HeaderFooter) -> Result<(), DocxParseError> {
|
|
477
|
+
let mut reader = Reader::from_str(xml);
|
|
478
|
+
reader.config_mut().trim_text(false);
|
|
479
|
+
|
|
480
|
+
let mut buf = Vec::new();
|
|
481
|
+
let mut current_paragraph: Option<Paragraph> = None;
|
|
482
|
+
let mut current_run: Option<Run> = None;
|
|
483
|
+
let mut in_text = false;
|
|
484
|
+
|
|
485
|
+
loop {
|
|
486
|
+
match reader.read_event_into(&mut buf) {
|
|
487
|
+
Ok(Event::Start(ref e)) => match e.name().as_ref() {
|
|
488
|
+
b"w:p" => current_paragraph = Some(Paragraph::new()),
|
|
489
|
+
b"w:r" => current_run = Some(Run::default()),
|
|
490
|
+
b"w:t" => in_text = true,
|
|
491
|
+
_ => {}
|
|
492
|
+
},
|
|
493
|
+
Ok(Event::Text(e)) => {
|
|
494
|
+
if in_text {
|
|
495
|
+
if let Some(ref mut run) = current_run {
|
|
496
|
+
let text = e.decode()?.into_owned();
|
|
497
|
+
run.text.push_str(&text);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
Ok(Event::End(ref e)) => match e.name().as_ref() {
|
|
502
|
+
b"w:t" => in_text = false,
|
|
503
|
+
b"w:r" => {
|
|
504
|
+
if let Some(run) = current_run.take() {
|
|
505
|
+
if let Some(ref mut para) = current_paragraph {
|
|
506
|
+
para.add_run(run);
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
b"w:p" => {
|
|
511
|
+
if let Some(para) = current_paragraph.take() {
|
|
512
|
+
header_footer.paragraphs.push(para);
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
_ => {}
|
|
516
|
+
},
|
|
517
|
+
Ok(Event::Eof) => break,
|
|
518
|
+
_ => {}
|
|
519
|
+
}
|
|
520
|
+
buf.clear();
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
Ok(())
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
fn parse_notes(&self, xml: &str, notes: &mut Vec<Note>, note_type: NoteType) -> Result<(), DocxParseError> {
|
|
527
|
+
let mut reader = Reader::from_str(xml);
|
|
528
|
+
reader.config_mut().trim_text(false);
|
|
529
|
+
|
|
530
|
+
let mut buf = Vec::new();
|
|
531
|
+
let mut current_note: Option<Note> = None;
|
|
532
|
+
let mut current_paragraph: Option<Paragraph> = None;
|
|
533
|
+
let mut current_run: Option<Run> = None;
|
|
534
|
+
let mut in_text = false;
|
|
535
|
+
|
|
536
|
+
loop {
|
|
537
|
+
match reader.read_event_into(&mut buf) {
|
|
538
|
+
Ok(Event::Start(ref e)) => match e.name().as_ref() {
|
|
539
|
+
b"w:footnote" | b"w:endnote" => {
|
|
540
|
+
let mut id = String::new();
|
|
541
|
+
for attr in e.attributes().flatten() {
|
|
542
|
+
if attr.key.as_ref() == b"w:id" {
|
|
543
|
+
id = String::from_utf8_lossy(&attr.value).to_string();
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
current_note = Some(Note {
|
|
547
|
+
id,
|
|
548
|
+
note_type: note_type.clone(),
|
|
549
|
+
paragraphs: Vec::new(),
|
|
550
|
+
});
|
|
551
|
+
}
|
|
552
|
+
b"w:p" => current_paragraph = Some(Paragraph::new()),
|
|
553
|
+
b"w:r" => current_run = Some(Run::default()),
|
|
554
|
+
b"w:t" => in_text = true,
|
|
555
|
+
_ => {}
|
|
556
|
+
},
|
|
557
|
+
Ok(Event::Text(e)) => {
|
|
558
|
+
if in_text {
|
|
559
|
+
if let Some(ref mut run) = current_run {
|
|
560
|
+
let text = e.decode()?.into_owned();
|
|
561
|
+
run.text.push_str(&text);
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
Ok(Event::End(ref e)) => match e.name().as_ref() {
|
|
566
|
+
b"w:t" => in_text = false,
|
|
567
|
+
b"w:r" => {
|
|
568
|
+
if let Some(run) = current_run.take() {
|
|
569
|
+
if let Some(ref mut para) = current_paragraph {
|
|
570
|
+
para.add_run(run);
|
|
571
|
+
}
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
b"w:p" => {
|
|
575
|
+
if let Some(para) = current_paragraph.take() {
|
|
576
|
+
if let Some(ref mut note) = current_note {
|
|
577
|
+
note.paragraphs.push(para);
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
b"w:footnote" | b"w:endnote" => {
|
|
582
|
+
if let Some(note) = current_note.take() {
|
|
583
|
+
if note.id != "-1" && note.id != "0" {
|
|
584
|
+
notes.push(note);
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
_ => {}
|
|
589
|
+
},
|
|
590
|
+
Ok(Event::Eof) => break,
|
|
591
|
+
_ => {}
|
|
592
|
+
}
|
|
593
|
+
buf.clear();
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
Ok(())
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
// --- Error ---
|
|
601
|
+
|
|
602
|
+
#[derive(Debug, thiserror::Error)]
|
|
603
|
+
enum DocxParseError {
|
|
604
|
+
#[error("IO error: {0}")]
|
|
605
|
+
Io(#[from] std::io::Error),
|
|
606
|
+
|
|
607
|
+
#[error("ZIP error: {0}")]
|
|
608
|
+
Zip(#[from] zip::result::ZipError),
|
|
609
|
+
|
|
610
|
+
#[error("XML parsing error: {0}")]
|
|
611
|
+
Xml(#[from] quick_xml::Error),
|
|
612
|
+
|
|
613
|
+
#[error("Required file not found in DOCX: {0}")]
|
|
614
|
+
FileNotFound(String),
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
// quick-xml's unescape returns an encoding error type
|
|
618
|
+
impl From<quick_xml::encoding::EncodingError> for DocxParseError {
|
|
619
|
+
fn from(e: quick_xml::encoding::EncodingError) -> Self {
|
|
620
|
+
DocxParseError::Xml(quick_xml::Error::Encoding(e))
|
|
621
|
+
}
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
// --- Public API ---
|
|
625
|
+
|
|
626
|
+
/// Parse a DOCX document from bytes and return the structured document.
|
|
627
|
+
pub fn parse_document(bytes: &[u8]) -> crate::error::Result<Document> {
|
|
628
|
+
let cursor = Cursor::new(bytes);
|
|
629
|
+
let parser = DocxParser::new(cursor)
|
|
630
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
631
|
+
parser
|
|
632
|
+
.parse()
|
|
633
|
+
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
/// Extract text from DOCX bytes.
|
|
637
|
+
pub fn extract_text_from_bytes(bytes: &[u8]) -> crate::error::Result<String> {
|
|
638
|
+
let doc = parse_document(bytes)?;
|
|
639
|
+
Ok(doc.extract_text())
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
#[cfg(test)]
|
|
643
|
+
mod tests {
|
|
644
|
+
use super::*;
|
|
645
|
+
|
|
646
|
+
/// Runs are concatenated directly; whitespace comes from the XML text content.
|
|
647
|
+
#[test]
|
|
648
|
+
fn test_paragraph_to_text_concatenates_runs() {
|
|
649
|
+
let mut para = Paragraph::new();
|
|
650
|
+
para.add_run(Run::new("Hello ".to_string()));
|
|
651
|
+
para.add_run(Run::new("World".to_string()));
|
|
652
|
+
assert_eq!(para.to_text(), "Hello World");
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
/// Mid-word run splits (e.g. drop caps) must not insert extra spaces.
|
|
656
|
+
#[test]
|
|
657
|
+
fn test_paragraph_to_text_mid_word_split() {
|
|
658
|
+
let mut para = Paragraph::new();
|
|
659
|
+
para.add_run(Run::new("S".to_string()));
|
|
660
|
+
para.add_run(Run::new("ermocination".to_string()));
|
|
661
|
+
assert_eq!(para.to_text(), "Sermocination");
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
#[test]
|
|
665
|
+
fn test_paragraph_to_text_single_run() {
|
|
666
|
+
let mut para = Paragraph::new();
|
|
667
|
+
para.add_run(Run::new("Hello".to_string()));
|
|
668
|
+
assert_eq!(para.to_text(), "Hello");
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
#[test]
|
|
672
|
+
fn test_paragraph_to_text_no_runs() {
|
|
673
|
+
let para = Paragraph::new();
|
|
674
|
+
assert_eq!(para.to_text(), "");
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
/// Whitespace between words is stored in the run text, not added by join.
|
|
678
|
+
#[test]
|
|
679
|
+
fn test_paragraph_to_text_whitespace_in_runs() {
|
|
680
|
+
let mut para = Paragraph::new();
|
|
681
|
+
para.add_run(Run::new("The ".to_string()));
|
|
682
|
+
para.add_run(Run::new("quick ".to_string()));
|
|
683
|
+
para.add_run(Run::new("fox".to_string()));
|
|
684
|
+
assert_eq!(para.to_text(), "The quick fox");
|
|
685
|
+
}
|
|
686
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
2
|
|
|
3
|
-
//! DOCX extractor
|
|
3
|
+
//! DOCX extractor for high-performance text extraction.
|
|
4
4
|
//!
|
|
5
5
|
//! Supports: Microsoft Word (.docx)
|
|
6
6
|
|
|
@@ -14,10 +14,10 @@ use async_trait::async_trait;
|
|
|
14
14
|
use std::borrow::Cow;
|
|
15
15
|
use std::io::Cursor;
|
|
16
16
|
|
|
17
|
-
/// High-performance DOCX extractor
|
|
17
|
+
/// High-performance DOCX extractor.
|
|
18
18
|
///
|
|
19
19
|
/// This extractor provides:
|
|
20
|
-
/// - Fast text extraction via streaming XML parsing
|
|
20
|
+
/// - Fast text extraction via streaming XML parsing
|
|
21
21
|
/// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
|
|
22
22
|
pub struct DocxExtractor;
|
|
23
23
|
|
|
@@ -52,7 +52,7 @@ impl Plugin for DocxExtractor {
|
|
|
52
52
|
}
|
|
53
53
|
|
|
54
54
|
fn description(&self) -> &str {
|
|
55
|
-
"High-performance DOCX text extraction
|
|
55
|
+
"High-performance DOCX text extraction with metadata support"
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
fn author(&self) -> &str {
|
|
@@ -60,15 +60,15 @@ impl Plugin for DocxExtractor {
|
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
/// Convert
|
|
63
|
+
/// Convert parsed DOCX table to Kreuzberg Table struct with markdown representation.
|
|
64
64
|
///
|
|
65
65
|
/// # Arguments
|
|
66
|
-
/// * `docx_table` - The
|
|
66
|
+
/// * `docx_table` - The parsed DOCX table
|
|
67
67
|
/// * `table_index` - Index of the table in the document (used as page_number)
|
|
68
68
|
///
|
|
69
69
|
/// # Returns
|
|
70
70
|
/// * `Table` - Converted table with cells and markdown representation
|
|
71
|
-
fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
|
|
71
|
+
fn convert_docx_table_to_table(docx_table: &crate::extraction::docx::parser::Table, table_index: usize) -> Table {
|
|
72
72
|
let cells: Vec<Vec<String>> = docx_table
|
|
73
73
|
.rows
|
|
74
74
|
.iter()
|
|
@@ -97,14 +97,6 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
|
|
|
97
97
|
}
|
|
98
98
|
}
|
|
99
99
|
|
|
100
|
-
/// Convert 2D cell data to markdown table format.
|
|
101
|
-
///
|
|
102
|
-
/// # Arguments
|
|
103
|
-
/// * `cells` - 2D vector of cell strings (rows × columns)
|
|
104
|
-
///
|
|
105
|
-
/// # Returns
|
|
106
|
-
/// * `String` - Markdown formatted table
|
|
107
|
-
|
|
108
100
|
#[async_trait]
|
|
109
101
|
impl DocumentExtractor for DocxExtractor {
|
|
110
102
|
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
@@ -126,9 +118,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
126
118
|
tokio::task::spawn_blocking(
|
|
127
119
|
move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
|
|
128
120
|
let _guard = span.entered();
|
|
129
|
-
let
|
|
130
|
-
let doc = docx_lite::parse_document(cursor)
|
|
131
|
-
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
121
|
+
let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
|
|
132
122
|
|
|
133
123
|
let text = doc.extract_text();
|
|
134
124
|
|
|
@@ -147,9 +137,7 @@ impl DocumentExtractor for DocxExtractor {
|
|
|
147
137
|
.await
|
|
148
138
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
|
|
149
139
|
} else {
|
|
150
|
-
let
|
|
151
|
-
let doc = docx_lite::parse_document(cursor)
|
|
152
|
-
.map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
|
|
140
|
+
let doc = crate::extraction::docx::parser::parse_document(content)?;
|
|
153
141
|
|
|
154
142
|
let text = doc.extract_text();
|
|
155
143
|
|
|
@@ -373,7 +361,7 @@ mod tests {
|
|
|
373
361
|
|
|
374
362
|
#[test]
|
|
375
363
|
fn test_convert_docx_table_to_table() {
|
|
376
|
-
use
|
|
364
|
+
use crate::extraction::docx::parser::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
|
|
377
365
|
|
|
378
366
|
let mut table = DocxTable::new();
|
|
379
367
|
|
|
@@ -132,6 +132,13 @@ pub struct Metadata {
|
|
|
132
132
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
133
133
|
pub error: Option<ErrorMetadata>,
|
|
134
134
|
|
|
135
|
+
/// Extraction duration in milliseconds (for benchmarking).
|
|
136
|
+
///
|
|
137
|
+
/// This field is populated by batch extraction to provide per-file timing
|
|
138
|
+
/// information. It's `None` for single-file extraction (which uses external timing).
|
|
139
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
140
|
+
pub extraction_duration_ms: Option<u64>,
|
|
141
|
+
|
|
135
142
|
/// Additional custom fields from postprocessors.
|
|
136
143
|
///
|
|
137
144
|
/// This flattened map allows Python/TypeScript postprocessors to add
|
|
@@ -83,6 +83,7 @@ async fn test_embed_empty_texts() {
|
|
|
83
83
|
|
|
84
84
|
/// Test embed endpoint with custom embedding configuration.
|
|
85
85
|
#[tokio::test]
|
|
86
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
86
87
|
async fn test_embed_with_custom_config() {
|
|
87
88
|
let app = create_router(ExtractionConfig::default());
|
|
88
89
|
|
|
@@ -125,6 +126,7 @@ async fn test_embed_with_custom_config() {
|
|
|
125
126
|
|
|
126
127
|
/// Test embed endpoint with single text.
|
|
127
128
|
#[tokio::test]
|
|
129
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
128
130
|
async fn test_embed_single_text() {
|
|
129
131
|
let app = create_router(ExtractionConfig::default());
|
|
130
132
|
|
|
@@ -201,6 +203,7 @@ async fn test_embed_batch() {
|
|
|
201
203
|
|
|
202
204
|
/// Test embed endpoint with long text.
|
|
203
205
|
#[tokio::test]
|
|
206
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
204
207
|
async fn test_embed_long_text() {
|
|
205
208
|
let app = create_router(ExtractionConfig::default());
|
|
206
209
|
|
|
@@ -317,6 +320,7 @@ async fn test_embed_rejects_simple_json_array() {
|
|
|
317
320
|
|
|
318
321
|
/// Test embed endpoint preserves embedding vector values across calls.
|
|
319
322
|
#[tokio::test]
|
|
323
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
320
324
|
async fn test_embed_deterministic() {
|
|
321
325
|
let app = create_router(ExtractionConfig::default());
|
|
322
326
|
|
|
@@ -376,6 +380,7 @@ async fn test_embed_deterministic() {
|
|
|
376
380
|
|
|
377
381
|
/// Test embed endpoint with different embedding presets.
|
|
378
382
|
#[tokio::test]
|
|
383
|
+
#[cfg_attr(target_arch = "aarch64", ignore = "ONNX Runtime model loading unstable on ARM")]
|
|
379
384
|
async fn test_embed_different_presets() {
|
|
380
385
|
let app = create_router(ExtractionConfig::default());
|
|
381
386
|
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
//! Regression test for https://github.com/kreuzberg-dev/kreuzberg/issues/359
|
|
2
|
+
//!
|
|
3
|
+
//! DOCX list items with multiple text runs should preserve whitespace between runs.
|
|
4
|
+
//! e.g. "Sermocination ypsiliform" must not become "Sermocinationypsiliform".
|
|
5
|
+
|
|
6
|
+
#![cfg(feature = "office")]
|
|
7
|
+
|
|
8
|
+
use kreuzberg::{ExtractionConfig, extract_file};
|
|
9
|
+
|
|
10
|
+
#[tokio::test]
|
|
11
|
+
async fn test_issue_359_docx_list_run_whitespace() {
|
|
12
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
13
|
+
.parent()
|
|
14
|
+
.expect("parent")
|
|
15
|
+
.parent()
|
|
16
|
+
.expect("workspace root");
|
|
17
|
+
let test_file = workspace_root.join("test_documents/docx/issue_359_list_whitespace.docx");
|
|
18
|
+
|
|
19
|
+
if !test_file.exists() {
|
|
20
|
+
println!("Skipping test: {:?} not found", test_file);
|
|
21
|
+
return;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
let result = extract_file(&test_file, None, &ExtractionConfig::default())
|
|
25
|
+
.await
|
|
26
|
+
.expect("Should extract DOCX successfully");
|
|
27
|
+
|
|
28
|
+
assert!(
|
|
29
|
+
result.content.contains("Sermocination ypsiliform"),
|
|
30
|
+
"Expected 'Sermocination ypsiliform' with space between runs, got: {:?}",
|
|
31
|
+
result.content
|
|
32
|
+
);
|
|
33
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-tesseract"
|
|
3
|
-
version = "4.2.10"
|
|
3
|
+
version = "4.2.12"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -26,7 +26,7 @@ image = { workspace = true, features = ["png"] }
|
|
|
26
26
|
[build-dependencies]
|
|
27
27
|
cc = { version = "^1.2.55", optional = true }
|
|
28
28
|
cmake = { version = "0.1.57", optional = true }
|
|
29
|
-
zip = { version = "7.
|
|
29
|
+
zip = { version = "7.4.0", optional = true }
|
|
30
30
|
|
|
31
31
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
32
32
|
[target.'cfg(target_os = "windows")'.build-dependencies]
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.10
|
|
4
|
+
version: 4.2.12
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-06 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -427,7 +427,8 @@ files:
|
|
|
427
427
|
- vendor/kreuzberg/src/extraction/archive/tar.rs
|
|
428
428
|
- vendor/kreuzberg/src/extraction/archive/zip.rs
|
|
429
429
|
- vendor/kreuzberg/src/extraction/capacity.rs
|
|
430
|
-
- vendor/kreuzberg/src/extraction/docx.rs
|
|
430
|
+
- vendor/kreuzberg/src/extraction/docx/mod.rs
|
|
431
|
+
- vendor/kreuzberg/src/extraction/docx/parser.rs
|
|
431
432
|
- vendor/kreuzberg/src/extraction/email.rs
|
|
432
433
|
- vendor/kreuzberg/src/extraction/excel.rs
|
|
433
434
|
- vendor/kreuzberg/src/extraction/html/converter.rs
|
|
@@ -745,6 +746,7 @@ files:
|
|
|
745
746
|
- vendor/kreuzberg/tests/image_integration.rs
|
|
746
747
|
- vendor/kreuzberg/tests/instrumentation_test.rs
|
|
747
748
|
- vendor/kreuzberg/tests/issue_350_regression_test.rs
|
|
749
|
+
- vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs
|
|
748
750
|
- vendor/kreuzberg/tests/jats_extractor_tests.rs
|
|
749
751
|
- vendor/kreuzberg/tests/jupyter_extractor_tests.rs
|
|
750
752
|
- vendor/kreuzberg/tests/keywords_integration.rs
|