kreuzberg 4.2.11 → 4.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e8a57a9be868cea7df0628ac25c7f0aabea6ced5368b0ec69452abd8ace56cd9
4
- data.tar.gz: dde627c6aff4ae060d53e5b44145bdaf6cfc8af870294f7521c200cf2b7e10ba
3
+ metadata.gz: 869101ec7a3d0814c2baed8606879024e94880ead0003d441a16199d25fd3a16
4
+ data.tar.gz: 9852c4c51345f362095306f40d910c2e1d4ae19f385754d4c8d3a960123f96ee
5
5
  SHA512:
6
- metadata.gz: 1c322dfecd4829e4e3aa13bbdd298f3f06f62877362867cc5795cc0690ef6b632ecbbac515f20c288ae3457ab5a022cb116b4922abc69a11e06151774f6f91f0
7
- data.tar.gz: 50e8a2b5489f169afb9f6b60150954463ed31988b049607c86ac501051df8138e21eec3f04d1e7b8a68f00079078416f7b3a5c2e7d93bf6bfb2bb8313c8e8aa3
6
+ metadata.gz: 579be7645c2f406ce8e7c4cc85ed511edb7c5879bc8674fe9ec4eb4375cf240113968f0ce35747bd10fae52ff5849df4616d8c6db299db609696da24b2700fff
7
+ data.tar.gz: adc5969c0480739fd57a6bdb832db435d64e16ee0b2827197f05fcf4f52a7e7c03affd704b4c1064df49391bb105c4166531b8c4ba96538906b38f863744c843
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.11)
4
+ kreuzberg (4.2.12)
5
5
  rb_sys (~> 0.9.119)
6
6
 
7
7
  GEM
@@ -123,7 +123,7 @@ GEM
123
123
  rubocop (~> 1.81)
124
124
  ruby-progressbar (1.13.0)
125
125
  securerandom (0.4.1)
126
- sorbet-runtime (0.6.12915)
126
+ sorbet-runtime (0.6.12925)
127
127
  steep (1.10.0)
128
128
  activesupport (>= 5.1)
129
129
  concurrent-ruby (>= 1.1.10)
@@ -209,7 +209,7 @@ CHECKSUMS
209
209
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
210
210
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
211
211
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
212
- kreuzberg (4.2.11)
212
+ kreuzberg (4.2.12)
213
213
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
214
214
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
215
215
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -244,7 +244,7 @@ CHECKSUMS
244
244
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
245
245
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
246
246
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
247
- sorbet-runtime (0.6.12915) sha256=21d2866b1edfe57c97d22f36db5bcf2db311f84290e56152e9faf4b4915aa315
247
+ sorbet-runtime (0.6.12925) sha256=ddd6fb1d8aaf6bc19119ffadbc4b96536f3d6766fa82059112dacb90977c6eca
248
248
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
249
249
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
250
250
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.11" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.12" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.2.11"
40
+ version = "4.2.12"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.11'
4
+ VERSION = '4.2.12'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.11"
6
+ version = "4.2.12"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.11"
3
+ version = "4.2.12"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -35,7 +35,6 @@ excel = ["dep:calamine", "dep:polars", "tokio-runtime"]
35
35
  office = [
36
36
  "dep:roxmltree",
37
37
  "dep:zip",
38
- "dep:docx-lite",
39
38
  "dep:quick-xml",
40
39
  "dep:pulldown-cmark",
41
40
  "dep:biblatex",
@@ -154,7 +153,7 @@ lopdf = { version = "0.39.0", optional = true }
154
153
  calamine = { version = "0.33.0", features = ["dates"], optional = true }
155
154
  polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
156
155
  roxmltree = { version = "0.21.1", optional = true }
157
- zip = { version = "7.3.0", optional = true }
156
+ zip = { version = "7.4.0", optional = true }
158
157
  mail-parser = { version = "0.11.1", optional = true }
159
158
  msg_parser = { version = "0.1.1", optional = true }
160
159
  html-to-markdown-rs = { workspace = true, features = [
@@ -165,7 +164,6 @@ quick-xml = { version = "0.39.0", features = ["serialize"], optional = true }
165
164
  tar = { version = "0.4.44", optional = true }
166
165
  sevenz-rust2 = { version = "0.20.1", optional = true }
167
166
  lzma-rust2 = { workspace = true, optional = true }
168
- docx-lite = { version = "0.2.0", optional = true }
169
167
 
170
168
  pulldown-cmark = { version = "0.13", optional = true }
171
169
  biblatex = { version = "0.11", optional = true }
@@ -218,7 +216,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
218
216
  tempfile = { workspace = true }
219
217
  filetime = "0.2"
220
218
  tar = "0.4.44"
221
- zip = "7.3.0"
219
+ zip = "7.4.0"
222
220
  serial_test = "3.3.1"
223
221
  anyhow = { workspace = true }
224
222
  tokio-test = "0.4"
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.11 Release**
20
+ > **🚀 Version 4.2.12 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -1,31 +1,21 @@
1
- //! DOCX (Microsoft Word) text extraction using docx-lite.
1
+ //! DOCX (Microsoft Word) text extraction.
2
2
  //!
3
- //! This module provides high-performance text extraction from DOCX files using the docx-lite
4
- //! library, which uses streaming XML parsing for efficiency.
3
+ //! This module provides high-performance text extraction from DOCX files using
4
+ //! streaming XML parsing for efficiency.
5
5
  //!
6
6
  //! Page break detection is best-effort, detecting only explicit page breaks (`<w:br w:type="page"/>`)
7
7
  //! in the document XML. This does not account for automatic pagination based on content reflowing.
8
8
 
9
+ pub mod parser;
10
+
9
11
  use crate::error::{KreuzbergError, Result};
10
12
  use crate::extraction::capacity;
11
13
  use crate::types::PageBoundary;
12
14
  use std::io::Cursor;
13
15
 
14
- /// Extract text from DOCX bytes using docx-lite.
15
- ///
16
- /// # Arguments
17
- /// * `bytes` - The DOCX file contents as bytes
18
- ///
19
- /// # Returns
20
- /// * `Ok(String)` - The extracted text content
21
- /// * `Err(KreuzbergError)` - If extraction fails
22
- ///
23
- /// # Performance
24
- /// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
25
- /// (~160 MB/s average).
16
+ /// Extract text from DOCX bytes.
26
17
  pub fn extract_text(bytes: &[u8]) -> Result<String> {
27
- docx_lite::extract_text_from_bytes(bytes)
28
- .map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
18
+ parser::extract_text_from_bytes(bytes)
29
19
  }
30
20
 
31
21
  /// Extract text and page boundaries from DOCX bytes.
@@ -0,0 +1,686 @@
1
+ //! Inline DOCX XML parser.
2
+ //!
3
+ //! Vendored and adapted from [docx-lite](https://github.com/v-lawyer/docx-lite) v0.2.0
4
+ //! (MIT OR Apache-2.0, V-Lawyer Team). See ATTRIBUTIONS.md for details.
5
+ //!
6
+ //! Changes from upstream:
7
+ //! - `Paragraph::to_text()` concatenates runs directly (no separator); inter-word whitespace stored inside `<w:t>` is preserved by parsing with `trim_text(false)` (fixes #359)
8
+ //! - Adapted to use kreuzberg's existing `quick-xml` and `zip` versions
9
+ //! - Removed file-path based APIs (we only need bytes/reader)
10
+
11
+ use std::collections::HashMap;
12
+ use std::io::{Cursor, Read, Seek};
13
+
14
+ use quick_xml::Reader;
15
+ use quick_xml::events::Event;
16
+
17
+ // --- Types ---
18
+
19
+ #[derive(Debug, Clone, Default)]
20
+ pub struct Document {
21
+ pub paragraphs: Vec<Paragraph>,
22
+ pub tables: Vec<Table>,
23
+ pub lists: Vec<ListItem>,
24
+ pub headers: Vec<HeaderFooter>,
25
+ pub footers: Vec<HeaderFooter>,
26
+ pub footnotes: Vec<Note>,
27
+ pub endnotes: Vec<Note>,
28
+ }
29
+
30
+ #[derive(Debug, Clone, Default)]
31
+ pub struct Paragraph {
32
+ pub runs: Vec<Run>,
33
+ pub style: Option<String>,
34
+ pub numbering_id: Option<i64>,
35
+ pub numbering_level: Option<i64>,
36
+ }
37
+
38
+ #[derive(Debug, Clone, Default)]
39
+ pub struct Run {
40
+ pub text: String,
41
+ pub bold: bool,
42
+ pub italic: bool,
43
+ pub underline: bool,
44
+ }
45
+
46
+ #[derive(Debug, Clone, Default)]
47
+ pub struct Table {
48
+ pub rows: Vec<TableRow>,
49
+ }
50
+
51
+ #[derive(Debug, Clone, Default)]
52
+ pub struct TableRow {
53
+ pub cells: Vec<TableCell>,
54
+ }
55
+
56
+ #[derive(Debug, Clone, Default)]
57
+ pub struct TableCell {
58
+ pub paragraphs: Vec<Paragraph>,
59
+ }
60
+
61
+ #[derive(Debug, Clone)]
62
+ pub struct ListItem {
63
+ pub level: u32,
64
+ pub list_type: ListType,
65
+ pub number: Option<String>,
66
+ pub text: String,
67
+ }
68
+
69
+ #[derive(Debug, Clone, PartialEq)]
70
+ pub enum ListType {
71
+ Bullet,
72
+ Numbered,
73
+ }
74
+
75
+ #[derive(Debug, Clone, Default)]
76
+ pub struct HeaderFooter {
77
+ pub paragraphs: Vec<Paragraph>,
78
+ pub tables: Vec<Table>,
79
+ pub header_type: HeaderFooterType,
80
+ }
81
+
82
+ #[derive(Debug, Clone, Default, PartialEq)]
83
+ pub enum HeaderFooterType {
84
+ #[default]
85
+ Default,
86
+ First,
87
+ Even,
88
+ Odd,
89
+ }
90
+
91
+ #[derive(Debug, Clone)]
92
+ pub struct Note {
93
+ pub id: String,
94
+ pub note_type: NoteType,
95
+ pub paragraphs: Vec<Paragraph>,
96
+ }
97
+
98
+ #[derive(Debug, Clone, PartialEq)]
99
+ pub enum NoteType {
100
+ Footnote,
101
+ Endnote,
102
+ }
103
+
104
+ // --- Impls ---
105
+
106
+ impl Document {
107
+ pub fn new() -> Self {
108
+ Self::default()
109
+ }
110
+
111
+ pub fn extract_text(&self) -> String {
112
+ let mut text = String::new();
113
+
114
+ let mut list_index = 0;
115
+ for paragraph in &self.paragraphs {
116
+ if let (Some(_num_id), Some(_level)) = (paragraph.numbering_id, paragraph.numbering_level) {
117
+ let para_text = paragraph.to_text();
118
+ if !para_text.is_empty() {
119
+ text.push_str(&para_text);
120
+ text.push('\n');
121
+ }
122
+ list_index += 1;
123
+ let _ = list_index; // suppress unused warning
124
+ } else {
125
+ let para_text = paragraph.to_text();
126
+ if !para_text.is_empty() {
127
+ text.push_str(&para_text);
128
+ text.push('\n');
129
+ }
130
+ }
131
+ }
132
+
133
+ for table in &self.tables {
134
+ for row in &table.rows {
135
+ for cell in &row.cells {
136
+ for paragraph in &cell.paragraphs {
137
+ let para_text = paragraph.to_text();
138
+ if !para_text.is_empty() {
139
+ text.push_str(&para_text);
140
+ text.push('\t');
141
+ }
142
+ }
143
+ }
144
+ text.push('\n');
145
+ }
146
+ text.push('\n');
147
+ }
148
+
149
+ text
150
+ }
151
+ }
152
+
153
+ impl Paragraph {
154
+ pub fn new() -> Self {
155
+ Self::default()
156
+ }
157
+
158
+ /// Concatenate text runs to produce paragraph text.
159
+ ///
160
+ /// In DOCX, whitespace between words is stored inside `<w:t>` elements
161
+ /// (e.g. `<w:t>Hello </w:t><w:t>World</w:t>`), so runs are joined
162
+ /// directly without adding extra separators. The parser must use
163
+ /// `trim_text(false)` to preserve this whitespace.
164
+ pub fn to_text(&self) -> String {
165
+ let mut text = String::new();
166
+ for run in &self.runs {
167
+ text.push_str(&run.text);
168
+ }
169
+ text
170
+ }
171
+
172
+ pub fn add_run(&mut self, run: Run) {
173
+ self.runs.push(run);
174
+ }
175
+ }
176
+
177
+ impl Run {
178
+ pub fn new(text: String) -> Self {
179
+ Self {
180
+ text,
181
+ ..Default::default()
182
+ }
183
+ }
184
+ }
185
+
186
+ impl Table {
187
+ pub fn new() -> Self {
188
+ Self::default()
189
+ }
190
+ }
191
+
192
+ impl HeaderFooter {
193
+ pub fn extract_text(&self) -> String {
194
+ let mut text = String::new();
195
+
196
+ for paragraph in &self.paragraphs {
197
+ let para_text = paragraph.to_text();
198
+ if !para_text.is_empty() {
199
+ text.push_str(&para_text);
200
+ text.push('\n');
201
+ }
202
+ }
203
+
204
+ for table in &self.tables {
205
+ for row in &table.rows {
206
+ for cell in &row.cells {
207
+ for paragraph in &cell.paragraphs {
208
+ let para_text = paragraph.to_text();
209
+ if !para_text.is_empty() {
210
+ text.push_str(&para_text);
211
+ text.push('\t');
212
+ }
213
+ }
214
+ }
215
+ text.push('\n');
216
+ }
217
+ }
218
+
219
+ text
220
+ }
221
+ }
222
+
223
+ // --- Parser ---
224
+
225
+ struct DocxParser<R: Read + Seek> {
226
+ archive: zip::ZipArchive<R>,
227
+ }
228
+
229
+ impl<R: Read + Seek> DocxParser<R> {
230
+ fn new(reader: R) -> Result<Self, DocxParseError> {
231
+ let archive = zip::ZipArchive::new(reader)?;
232
+ Ok(Self { archive })
233
+ }
234
+
235
+ fn parse(mut self) -> Result<Document, DocxParseError> {
236
+ let mut document = Document::new();
237
+
238
+ let document_xml = self.read_file("word/document.xml")?;
239
+ self.parse_document_xml(&document_xml, &mut document)?;
240
+
241
+ if let Ok(numbering_xml) = self.read_file("word/numbering.xml") {
242
+ let numbering_defs = self.parse_numbering(&numbering_xml)?;
243
+ self.process_lists(&mut document, &numbering_defs);
244
+ }
245
+
246
+ self.parse_headers_footers(&mut document)?;
247
+
248
+ if let Ok(footnotes_xml) = self.read_file("word/footnotes.xml") {
249
+ self.parse_notes(&footnotes_xml, &mut document.footnotes, NoteType::Footnote)?;
250
+ }
251
+
252
+ if let Ok(endnotes_xml) = self.read_file("word/endnotes.xml") {
253
+ self.parse_notes(&endnotes_xml, &mut document.endnotes, NoteType::Endnote)?;
254
+ }
255
+
256
+ Ok(document)
257
+ }
258
+
259
+ fn read_file(&mut self, path: &str) -> Result<String, DocxParseError> {
260
+ let mut file = self
261
+ .archive
262
+ .by_name(path)
263
+ .map_err(|_| DocxParseError::FileNotFound(path.to_string()))?;
264
+
265
+ let mut contents = String::new();
266
+ file.read_to_string(&mut contents)?;
267
+ Ok(contents)
268
+ }
269
+
270
+ fn parse_document_xml(&self, xml: &str, document: &mut Document) -> Result<(), DocxParseError> {
271
+ let mut reader = Reader::from_str(xml);
272
+ reader.config_mut().trim_text(false);
273
+
274
+ let mut buf = Vec::new();
275
+ let mut current_paragraph: Option<Paragraph> = None;
276
+ let mut current_run: Option<Run> = None;
277
+ let mut current_table: Option<Table> = None;
278
+ let mut current_row: Option<TableRow> = None;
279
+ let mut current_cell: Option<TableCell> = None;
280
+ let mut in_text = false;
281
+ let mut in_table = false;
282
+
283
+ loop {
284
+ match reader.read_event_into(&mut buf) {
285
+ Ok(Event::Start(ref e)) => match e.name().as_ref() {
286
+ b"w:p" => {
287
+ if in_table {
288
+ if current_cell.is_none() {
289
+ current_cell = Some(TableCell::default());
290
+ }
291
+ } else {
292
+ current_paragraph = Some(Paragraph::new());
293
+ }
294
+ }
295
+ b"w:numPr" => {
296
+ if let Some(ref mut para) = current_paragraph {
297
+ para.numbering_id = Some(1);
298
+ para.numbering_level = Some(0);
299
+ }
300
+ }
301
+ b"w:r" => {
302
+ current_run = Some(Run::default());
303
+ }
304
+ b"w:t" => {
305
+ in_text = true;
306
+ }
307
+ b"w:tbl" => {
308
+ in_table = true;
309
+ current_table = Some(Table::new());
310
+ }
311
+ b"w:tr" => {
312
+ current_row = Some(TableRow::default());
313
+ }
314
+ b"w:tc" => {
315
+ current_cell = Some(TableCell::default());
316
+ }
317
+ b"w:b" => {
318
+ if let Some(ref mut run) = current_run {
319
+ run.bold = true;
320
+ }
321
+ }
322
+ b"w:i" => {
323
+ if let Some(ref mut run) = current_run {
324
+ run.italic = true;
325
+ }
326
+ }
327
+ b"w:u" => {
328
+ if let Some(ref mut run) = current_run {
329
+ run.underline = true;
330
+ }
331
+ }
332
+ _ => {}
333
+ },
334
+ Ok(Event::Text(e)) => {
335
+ if in_text {
336
+ if let Some(ref mut run) = current_run {
337
+ let text = e.decode()?.into_owned();
338
+ run.text.push_str(&text);
339
+ }
340
+ }
341
+ }
342
+ Ok(Event::End(ref e)) => match e.name().as_ref() {
343
+ b"w:t" => {
344
+ in_text = false;
345
+ }
346
+ b"w:r" => {
347
+ if let Some(run) = current_run.take() {
348
+ if in_table {
349
+ if let Some(ref mut cell) = current_cell {
350
+ if cell.paragraphs.is_empty() {
351
+ cell.paragraphs.push(Paragraph::new());
352
+ }
353
+ if let Some(para) = cell.paragraphs.last_mut() {
354
+ para.add_run(run);
355
+ }
356
+ }
357
+ } else if let Some(ref mut para) = current_paragraph {
358
+ para.add_run(run);
359
+ }
360
+ }
361
+ }
362
+ b"w:p" => {
363
+ if in_table {
364
+ // handled via cell
365
+ } else if let Some(para) = current_paragraph.take() {
366
+ document.paragraphs.push(para);
367
+ }
368
+ }
369
+ b"w:tc" => {
370
+ if let Some(cell) = current_cell.take() {
371
+ if let Some(ref mut row) = current_row {
372
+ row.cells.push(cell);
373
+ }
374
+ }
375
+ }
376
+ b"w:tr" => {
377
+ if let Some(row) = current_row.take() {
378
+ if let Some(ref mut table) = current_table {
379
+ table.rows.push(row);
380
+ }
381
+ }
382
+ }
383
+ b"w:tbl" => {
384
+ in_table = false;
385
+ if let Some(table) = current_table.take() {
386
+ document.tables.push(table);
387
+ }
388
+ }
389
+ _ => {}
390
+ },
391
+ Ok(Event::Eof) => break,
392
+ Err(e) => return Err(e.into()),
393
+ _ => {}
394
+ }
395
+ buf.clear();
396
+ }
397
+
398
+ Ok(())
399
+ }
400
+
401
+ fn parse_numbering(&self, xml: &str) -> Result<HashMap<i64, ListType>, DocxParseError> {
402
+ let mut numbering_defs = HashMap::new();
403
+ let mut reader = Reader::from_str(xml);
404
+ reader.config_mut().trim_text(false);
405
+
406
+ let mut buf = Vec::new();
407
+ let mut current_num_id: Option<i64> = None;
408
+
409
+ loop {
410
+ match reader.read_event_into(&mut buf) {
411
+ Ok(Event::Start(ref e)) => {
412
+ if e.name().as_ref() == b"w:num" {
413
+ for attr in e.attributes().flatten() {
414
+ if attr.key.as_ref() == b"w:numId" {
415
+ if let Ok(id_str) = std::str::from_utf8(&attr.value) {
416
+ current_num_id = id_str.parse().ok();
417
+ }
418
+ }
419
+ }
420
+ }
421
+ }
422
+ Ok(Event::End(ref e)) => {
423
+ if e.name().as_ref() == b"w:num" {
424
+ if let Some(id) = current_num_id {
425
+ numbering_defs.insert(id, ListType::Bullet);
426
+ }
427
+ current_num_id = None;
428
+ }
429
+ }
430
+ Ok(Event::Eof) => break,
431
+ _ => {}
432
+ }
433
+ buf.clear();
434
+ }
435
+
436
+ Ok(numbering_defs)
437
+ }
438
+
439
+ fn process_lists(&self, document: &mut Document, numbering_defs: &HashMap<i64, ListType>) {
440
+ for paragraph in &document.paragraphs {
441
+ if let (Some(num_id), Some(level)) = (paragraph.numbering_id, paragraph.numbering_level) {
442
+ let list_type = numbering_defs.get(&num_id).cloned().unwrap_or(ListType::Bullet);
443
+
444
+ let list_item = ListItem {
445
+ level: level as u32,
446
+ list_type,
447
+ number: None,
448
+ text: paragraph.to_text(),
449
+ };
450
+
451
+ document.lists.push(list_item);
452
+ }
453
+ }
454
+ }
455
+
456
+ fn parse_headers_footers(&mut self, document: &mut Document) -> Result<(), DocxParseError> {
457
+ for i in 1..=3 {
458
+ let header_path = format!("word/header{}.xml", i);
459
+ if let Ok(header_xml) = self.read_file(&header_path) {
460
+ let mut header = HeaderFooter::default();
461
+ self.parse_header_footer_content(&header_xml, &mut header)?;
462
+ document.headers.push(header);
463
+ }
464
+
465
+ let footer_path = format!("word/footer{}.xml", i);
466
+ if let Ok(footer_xml) = self.read_file(&footer_path) {
467
+ let mut footer = HeaderFooter::default();
468
+ self.parse_header_footer_content(&footer_xml, &mut footer)?;
469
+ document.footers.push(footer);
470
+ }
471
+ }
472
+
473
+ Ok(())
474
+ }
475
+
476
+ fn parse_header_footer_content(&self, xml: &str, header_footer: &mut HeaderFooter) -> Result<(), DocxParseError> {
477
+ let mut reader = Reader::from_str(xml);
478
+ reader.config_mut().trim_text(false);
479
+
480
+ let mut buf = Vec::new();
481
+ let mut current_paragraph: Option<Paragraph> = None;
482
+ let mut current_run: Option<Run> = None;
483
+ let mut in_text = false;
484
+
485
+ loop {
486
+ match reader.read_event_into(&mut buf) {
487
+ Ok(Event::Start(ref e)) => match e.name().as_ref() {
488
+ b"w:p" => current_paragraph = Some(Paragraph::new()),
489
+ b"w:r" => current_run = Some(Run::default()),
490
+ b"w:t" => in_text = true,
491
+ _ => {}
492
+ },
493
+ Ok(Event::Text(e)) => {
494
+ if in_text {
495
+ if let Some(ref mut run) = current_run {
496
+ let text = e.decode()?.into_owned();
497
+ run.text.push_str(&text);
498
+ }
499
+ }
500
+ }
501
+ Ok(Event::End(ref e)) => match e.name().as_ref() {
502
+ b"w:t" => in_text = false,
503
+ b"w:r" => {
504
+ if let Some(run) = current_run.take() {
505
+ if let Some(ref mut para) = current_paragraph {
506
+ para.add_run(run);
507
+ }
508
+ }
509
+ }
510
+ b"w:p" => {
511
+ if let Some(para) = current_paragraph.take() {
512
+ header_footer.paragraphs.push(para);
513
+ }
514
+ }
515
+ _ => {}
516
+ },
517
+ Ok(Event::Eof) => break,
518
+ _ => {}
519
+ }
520
+ buf.clear();
521
+ }
522
+
523
+ Ok(())
524
+ }
525
+
526
+ fn parse_notes(&self, xml: &str, notes: &mut Vec<Note>, note_type: NoteType) -> Result<(), DocxParseError> {
527
+ let mut reader = Reader::from_str(xml);
528
+ reader.config_mut().trim_text(false);
529
+
530
+ let mut buf = Vec::new();
531
+ let mut current_note: Option<Note> = None;
532
+ let mut current_paragraph: Option<Paragraph> = None;
533
+ let mut current_run: Option<Run> = None;
534
+ let mut in_text = false;
535
+
536
+ loop {
537
+ match reader.read_event_into(&mut buf) {
538
+ Ok(Event::Start(ref e)) => match e.name().as_ref() {
539
+ b"w:footnote" | b"w:endnote" => {
540
+ let mut id = String::new();
541
+ for attr in e.attributes().flatten() {
542
+ if attr.key.as_ref() == b"w:id" {
543
+ id = String::from_utf8_lossy(&attr.value).to_string();
544
+ }
545
+ }
546
+ current_note = Some(Note {
547
+ id,
548
+ note_type: note_type.clone(),
549
+ paragraphs: Vec::new(),
550
+ });
551
+ }
552
+ b"w:p" => current_paragraph = Some(Paragraph::new()),
553
+ b"w:r" => current_run = Some(Run::default()),
554
+ b"w:t" => in_text = true,
555
+ _ => {}
556
+ },
557
+ Ok(Event::Text(e)) => {
558
+ if in_text {
559
+ if let Some(ref mut run) = current_run {
560
+ let text = e.decode()?.into_owned();
561
+ run.text.push_str(&text);
562
+ }
563
+ }
564
+ }
565
+ Ok(Event::End(ref e)) => match e.name().as_ref() {
566
+ b"w:t" => in_text = false,
567
+ b"w:r" => {
568
+ if let Some(run) = current_run.take() {
569
+ if let Some(ref mut para) = current_paragraph {
570
+ para.add_run(run);
571
+ }
572
+ }
573
+ }
574
+ b"w:p" => {
575
+ if let Some(para) = current_paragraph.take() {
576
+ if let Some(ref mut note) = current_note {
577
+ note.paragraphs.push(para);
578
+ }
579
+ }
580
+ }
581
+ b"w:footnote" | b"w:endnote" => {
582
+ if let Some(note) = current_note.take() {
583
+ if note.id != "-1" && note.id != "0" {
584
+ notes.push(note);
585
+ }
586
+ }
587
+ }
588
+ _ => {}
589
+ },
590
+ Ok(Event::Eof) => break,
591
+ _ => {}
592
+ }
593
+ buf.clear();
594
+ }
595
+
596
+ Ok(())
597
+ }
598
+ }
599
+
600
+ // --- Error ---
601
+
602
+ #[derive(Debug, thiserror::Error)]
603
+ enum DocxParseError {
604
+ #[error("IO error: {0}")]
605
+ Io(#[from] std::io::Error),
606
+
607
+ #[error("ZIP error: {0}")]
608
+ Zip(#[from] zip::result::ZipError),
609
+
610
+ #[error("XML parsing error: {0}")]
611
+ Xml(#[from] quick_xml::Error),
612
+
613
+ #[error("Required file not found in DOCX: {0}")]
614
+ FileNotFound(String),
615
+ }
616
+
617
+ // `decode()` on text events (used with `?` in the parse loops above) returns quick-xml's encoding error type
618
+ impl From<quick_xml::encoding::EncodingError> for DocxParseError {
619
+ fn from(e: quick_xml::encoding::EncodingError) -> Self {
620
+ DocxParseError::Xml(quick_xml::Error::Encoding(e))
621
+ }
622
+ }
623
+
624
+ // --- Public API ---
625
+
626
+ /// Parse a DOCX document from bytes and return the structured document.
627
+ pub fn parse_document(bytes: &[u8]) -> crate::error::Result<Document> {
628
+ let cursor = Cursor::new(bytes);
629
+ let parser = DocxParser::new(cursor)
630
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
631
+ parser
632
+ .parse()
633
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))
634
+ }
635
+
636
+ /// Extract text from DOCX bytes.
637
+ pub fn extract_text_from_bytes(bytes: &[u8]) -> crate::error::Result<String> {
638
+ let doc = parse_document(bytes)?;
639
+ Ok(doc.extract_text())
640
+ }
641
+
642
+ #[cfg(test)]
643
+ mod tests {
644
+ use super::*;
645
+
646
+ /// Runs are concatenated directly; whitespace comes from the XML text content.
647
+ #[test]
648
+ fn test_paragraph_to_text_concatenates_runs() {
649
+ let mut para = Paragraph::new();
650
+ para.add_run(Run::new("Hello ".to_string()));
651
+ para.add_run(Run::new("World".to_string()));
652
+ assert_eq!(para.to_text(), "Hello World");
653
+ }
654
+
655
+ /// Mid-word run splits (e.g. drop caps) must not insert extra spaces.
656
+ #[test]
657
+ fn test_paragraph_to_text_mid_word_split() {
658
+ let mut para = Paragraph::new();
659
+ para.add_run(Run::new("S".to_string()));
660
+ para.add_run(Run::new("ermocination".to_string()));
661
+ assert_eq!(para.to_text(), "Sermocination");
662
+ }
663
+
664
+ #[test]
665
+ fn test_paragraph_to_text_single_run() {
666
+ let mut para = Paragraph::new();
667
+ para.add_run(Run::new("Hello".to_string()));
668
+ assert_eq!(para.to_text(), "Hello");
669
+ }
670
+
671
+ #[test]
672
+ fn test_paragraph_to_text_no_runs() {
673
+ let para = Paragraph::new();
674
+ assert_eq!(para.to_text(), "");
675
+ }
676
+
677
+ /// Whitespace between words is stored in the run text, not added by join.
678
+ #[test]
679
+ fn test_paragraph_to_text_whitespace_in_runs() {
680
+ let mut para = Paragraph::new();
681
+ para.add_run(Run::new("The ".to_string()));
682
+ para.add_run(Run::new("quick ".to_string()));
683
+ para.add_run(Run::new("fox".to_string()));
684
+ assert_eq!(para.to_text(), "The quick fox");
685
+ }
686
+ }
@@ -1,6 +1,6 @@
1
1
  #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
2
 
3
- //! DOCX extractor using docx-lite for high-performance text extraction.
3
+ //! DOCX extractor for high-performance text extraction.
4
4
  //!
5
5
  //! Supports: Microsoft Word (.docx)
6
6
 
@@ -14,10 +14,10 @@ use async_trait::async_trait;
14
14
  use std::borrow::Cow;
15
15
  use std::io::Cursor;
16
16
 
17
- /// High-performance DOCX extractor using docx-lite.
17
+ /// High-performance DOCX extractor.
18
18
  ///
19
19
  /// This extractor provides:
20
- /// - Fast text extraction via streaming XML parsing (~160 MB/s average)
20
+ /// - Fast text extraction via streaming XML parsing
21
21
  /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
22
22
  pub struct DocxExtractor;
23
23
 
@@ -52,7 +52,7 @@ impl Plugin for DocxExtractor {
52
52
  }
53
53
 
54
54
  fn description(&self) -> &str {
55
- "High-performance DOCX text extraction using docx-lite with metadata support"
55
+ "High-performance DOCX text extraction with metadata support"
56
56
  }
57
57
 
58
58
  fn author(&self) -> &str {
@@ -60,15 +60,15 @@ impl Plugin for DocxExtractor {
60
60
  }
61
61
  }
62
62
 
63
- /// Convert docx-lite table to Kreuzberg Table struct with markdown representation.
63
+ /// Convert parsed DOCX table to Kreuzberg Table struct with markdown representation.
64
64
  ///
65
65
  /// # Arguments
66
- /// * `docx_table` - The table from docx-lite library
66
+ /// * `docx_table` - The parsed DOCX table
67
67
  /// * `table_index` - Index of the table in the document (used as page_number)
68
68
  ///
69
69
  /// # Returns
70
70
  /// * `Table` - Converted table with cells and markdown representation
71
- fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
71
+ fn convert_docx_table_to_table(docx_table: &crate::extraction::docx::parser::Table, table_index: usize) -> Table {
72
72
  let cells: Vec<Vec<String>> = docx_table
73
73
  .rows
74
74
  .iter()
@@ -97,14 +97,6 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
97
97
  }
98
98
  }
99
99
 
100
- /// Convert 2D cell data to markdown table format.
101
- ///
102
- /// # Arguments
103
- /// * `cells` - 2D vector of cell strings (rows × columns)
104
- ///
105
- /// # Returns
106
- /// * `String` - Markdown formatted table
107
-
108
100
  #[async_trait]
109
101
  impl DocumentExtractor for DocxExtractor {
110
102
  #[cfg_attr(feature = "otel", tracing::instrument(
@@ -126,9 +118,7 @@ impl DocumentExtractor for DocxExtractor {
126
118
  tokio::task::spawn_blocking(
127
119
  move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
128
120
  let _guard = span.entered();
129
- let cursor = Cursor::new(&content_owned);
130
- let doc = docx_lite::parse_document(cursor)
131
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
121
+ let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
132
122
 
133
123
  let text = doc.extract_text();
134
124
 
@@ -147,9 +137,7 @@ impl DocumentExtractor for DocxExtractor {
147
137
  .await
148
138
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
149
139
  } else {
150
- let cursor = Cursor::new(content);
151
- let doc = docx_lite::parse_document(cursor)
152
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
140
+ let doc = crate::extraction::docx::parser::parse_document(content)?;
153
141
 
154
142
  let text = doc.extract_text();
155
143
 
@@ -373,7 +361,7 @@ mod tests {
373
361
 
374
362
  #[test]
375
363
  fn test_convert_docx_table_to_table() {
376
- use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
364
+ use crate::extraction::docx::parser::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
377
365
 
378
366
  let mut table = DocxTable::new();
379
367
 
@@ -0,0 +1,33 @@
1
+ //! Regression test for https://github.com/kreuzberg-dev/kreuzberg/issues/359
2
+ //!
3
+ //! DOCX list items with multiple text runs should preserve whitespace between runs.
4
+ //! e.g. "Sermocination ypsiliform" must not become "Sermocinationypsiliform".
5
+
6
+ #![cfg(feature = "office")]
7
+
8
+ use kreuzberg::{ExtractionConfig, extract_file};
9
+
10
+ #[tokio::test]
11
+ async fn test_issue_359_docx_list_run_whitespace() {
12
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
13
+ .parent()
14
+ .expect("parent")
15
+ .parent()
16
+ .expect("workspace root");
17
+ let test_file = workspace_root.join("test_documents/docx/issue_359_list_whitespace.docx");
18
+
19
+ if !test_file.exists() {
20
+ println!("Skipping test: {:?} not found", test_file);
21
+ return;
22
+ }
23
+
24
+ let result = extract_file(&test_file, None, &ExtractionConfig::default())
25
+ .await
26
+ .expect("Should extract DOCX successfully");
27
+
28
+ assert!(
29
+ result.content.contains("Sermocination ypsiliform"),
30
+ "Expected 'Sermocination ypsiliform' with space between runs, got: {:?}",
31
+ result.content
32
+ );
33
+ }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.11"
3
+ version = "4.2.12"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ image = { workspace = true, features = ["png"] }
26
26
  [build-dependencies]
27
27
  cc = { version = "^1.2.55", optional = true }
28
28
  cmake = { version = "0.1.57", optional = true }
29
- zip = { version = "7.3.0", optional = true }
29
+ zip = { version = "7.4.0", optional = true }
30
30
 
31
31
  # Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
32
32
  [target.'cfg(target_os = "windows")'.build-dependencies]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.11
4
+ version: 4.2.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
@@ -427,7 +427,8 @@ files:
427
427
  - vendor/kreuzberg/src/extraction/archive/tar.rs
428
428
  - vendor/kreuzberg/src/extraction/archive/zip.rs
429
429
  - vendor/kreuzberg/src/extraction/capacity.rs
430
- - vendor/kreuzberg/src/extraction/docx.rs
430
+ - vendor/kreuzberg/src/extraction/docx/mod.rs
431
+ - vendor/kreuzberg/src/extraction/docx/parser.rs
431
432
  - vendor/kreuzberg/src/extraction/email.rs
432
433
  - vendor/kreuzberg/src/extraction/excel.rs
433
434
  - vendor/kreuzberg/src/extraction/html/converter.rs
@@ -745,6 +746,7 @@ files:
745
746
  - vendor/kreuzberg/tests/image_integration.rs
746
747
  - vendor/kreuzberg/tests/instrumentation_test.rs
747
748
  - vendor/kreuzberg/tests/issue_350_regression_test.rs
749
+ - vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs
748
750
  - vendor/kreuzberg/tests/jats_extractor_tests.rs
749
751
  - vendor/kreuzberg/tests/jupyter_extractor_tests.rs
750
752
  - vendor/kreuzberg/tests/keywords_integration.rs