kreuzberg 4.2.11 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -9
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
  19. data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
  20. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  21. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  22. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  23. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  24. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  25. data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
  26. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  27. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  28. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  29. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  30. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  31. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  32. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  33. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  34. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  35. data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
  36. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  37. metadata +7 -3
@@ -0,0 +1,563 @@
1
+ //! Citation format extractors for RIS, PubMed/MEDLINE, and EndNote XML.
2
+ //!
3
+ //! Extracts and parses citation files in various formats, providing structured access
4
+ //! to bibliography entries, metadata, and author information.
5
+
6
+ use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use ahash::AHashMap;
11
+ use async_trait::async_trait;
12
+ use std::borrow::Cow;
13
+ use std::collections::HashSet;
14
+
15
+ #[cfg(feature = "office")]
16
+ use biblib::{CitationParser, EndNoteXmlParser, PubMedParser, RisParser};
17
+
18
/// Citation format extractor for RIS, PubMed/MEDLINE, and EndNote XML formats.
///
/// Parses citation files and extracts structured bibliography data including
/// entries, authors, publication years, and format-specific metadata.
///
/// The extractor is a stateless unit struct, so it is trivially copyable and
/// its `Default` value is identical to [`CitationExtractor::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct CitationExtractor;

impl CitationExtractor {
    /// Create a new citation extractor.
    ///
    /// Usable in `const` contexts; equivalent to `CitationExtractor::default()`.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
36
+
37
+ impl Plugin for CitationExtractor {
38
+ fn name(&self) -> &str {
39
+ "citation-extractor"
40
+ }
41
+
42
+ fn version(&self) -> String {
43
+ env!("CARGO_PKG_VERSION").to_string()
44
+ }
45
+
46
+ fn initialize(&self) -> Result<()> {
47
+ Ok(())
48
+ }
49
+
50
+ fn shutdown(&self) -> Result<()> {
51
+ Ok(())
52
+ }
53
+
54
+ fn description(&self) -> &str {
55
+ "Extracts and parses citation files (RIS, PubMed/MEDLINE, EndNote XML) with structured metadata"
56
+ }
57
+
58
+ fn author(&self) -> &str {
59
+ "Kreuzberg Team"
60
+ }
61
+ }
62
+
63
#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for CitationExtractor {
    /// Parse a citation file and return readable text plus aggregate metadata.
    ///
    /// The parser is selected from `mime_type`:
    ///
    /// * `application/x-research-info-systems` - RIS
    /// * `application/x-pubmed` - PubMed/MEDLINE
    /// * `application/x-endnote+xml` - EndNote XML
    ///
    /// Any other MIME type returns the input as text with `citation_count = 0`
    /// and `format = "Unknown"`. If the selected parser reports an error, the
    /// input is likewise returned as text, with `citation_count = 0` and the
    /// format name of the parser that was tried.
    ///
    /// # Output
    ///
    /// `content` holds one block per citation (title, authors, journal, year,
    /// volume/issue/pages, DOI, PMID, abstract, keywords), each terminated by a
    /// `---` line. `metadata.additional` holds:
    ///
    /// * `citation_count` - number of parsed citations
    /// * `authors` - sorted, de-duplicated author names (always present)
    /// * `year_range` - `{min, max, years}`, only when a positive year was found
    /// * `dois` - non-empty DOIs in document order, only when any were found
    /// * `keywords` - sorted, de-duplicated keywords, only when any were found
    /// * `format` - name of the citation format
    ///
    /// # Errors
    ///
    /// Every path through this implementation returns `Ok`; parse failures are
    /// downgraded to the raw-text fallback described above.
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        // Invalid UTF-8 sequences are replaced rather than rejecting the file.
        let citation_str = String::from_utf8_lossy(content);

        // The fallback path and the normal path produce the same result shape,
        // so it is assembled in exactly one place.
        let build_result = |text: String, additional: AHashMap<Cow<'static, str>, serde_json::Value>| ExtractionResult {
            content: text,
            mime_type: mime_type.to_string().into(),
            metadata: Metadata {
                additional,
                ..Default::default()
            },
            pages: None,
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            elements: None,
        };

        // Parse based on MIME type.
        let (parse_result, format_string) = match mime_type {
            "application/x-research-info-systems" => (RisParser::new().parse(&citation_str), "RIS"),
            "application/x-pubmed" => (PubMedParser::new().parse(&citation_str), "PubMed"),
            "application/x-endnote+xml" => (EndNoteXmlParser::new().parse(&citation_str), "EndNote XML"),
            _ => {
                // Fallback: return raw content if the MIME type is unexpected.
                let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
                additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(0));
                additional.insert(Cow::Borrowed("format"), serde_json::json!("Unknown"));
                return Ok(build_result(citation_str.to_string(), additional));
            }
        };

        let mut citation_count: usize = 0;
        let mut authors_set = HashSet::new();
        let mut years_set = HashSet::new();
        let mut dois_vec = Vec::new();
        let mut keywords_set = HashSet::new();
        let mut formatted_content = String::new();

        match parse_result {
            Ok(citations) => {
                for citation in &citations {
                    citation_count += 1;

                    if !citation.title.is_empty() {
                        formatted_content.push_str(&format!("Title: {}\n", citation.title));
                    }

                    // Build each display name once; it feeds both the text
                    // block and the de-duplicated metadata set.
                    let author_names: Vec<String> = citation
                        .authors
                        .iter()
                        .map(|author| match &author.given_name {
                            Some(given) => format!("{} {}", given, author.name),
                            None => author.name.clone(),
                        })
                        .collect();
                    if !author_names.is_empty() {
                        formatted_content.push_str(&format!("Authors: {}\n", author_names.join(", ")));
                    }
                    authors_set.extend(author_names.into_iter().filter(|name| !name.is_empty()));

                    if let Some(journal) = &citation.journal {
                        formatted_content.push_str(&format!("Journal: {}\n", journal));
                    }

                    // Only positive years are meaningful; the same filter now
                    // governs both the text line and the metadata so the two
                    // cannot disagree.
                    let positive_year = citation
                        .date
                        .as_ref()
                        .map(|date| date.year)
                        .filter(|year| *year > 0);
                    if let Some(year) = positive_year {
                        formatted_content.push_str(&format!("Year: {}\n", year));
                        // Checked conversion instead of a truncating `as` cast.
                        if let Ok(year) = u32::try_from(year) {
                            years_set.insert(year);
                        }
                    }

                    // Volume, issue and pages are independent fields: emit
                    // whichever are present instead of requiring a volume.
                    let mut locator: Vec<String> = Vec::with_capacity(3);
                    if let Some(volume) = &citation.volume {
                        locator.push(format!("Volume: {}", volume));
                    }
                    if let Some(issue) = &citation.issue {
                        locator.push(format!("Issue: {}", issue));
                    }
                    if let Some(pages) = &citation.pages {
                        locator.push(format!("Pages: {}", pages));
                    }
                    if !locator.is_empty() {
                        formatted_content.push_str(&locator.join(", "));
                        formatted_content.push('\n');
                    }

                    // Empty DOIs are skipped in the text as well as the metadata.
                    if let Some(doi) = &citation.doi {
                        if !doi.is_empty() {
                            formatted_content.push_str(&format!("DOI: {}\n", doi));
                            dois_vec.push(doi.clone());
                        }
                    }

                    if let Some(pmid) = &citation.pmid {
                        formatted_content.push_str(&format!("PMID: {}\n", pmid));
                    }

                    if let Some(abstract_text) = &citation.abstract_text {
                        if !abstract_text.is_empty() {
                            formatted_content.push_str(&format!("Abstract: {}\n", abstract_text));
                        }
                    }

                    if !citation.keywords.is_empty() {
                        formatted_content.push_str(&format!("Keywords: {}\n", citation.keywords.join(", ")));
                    }
                    keywords_set.extend(citation.keywords.iter().filter(|keyword| !keyword.is_empty()).cloned());

                    formatted_content.push_str("---\n");
                }
            }
            Err(_err) => {
                // Best effort: a file the parser rejects is still returned as text.
                #[cfg(feature = "otel")]
                tracing::warn!("Citation parsing failed, returning raw content: {}", _err);
                formatted_content = citation_str.to_string();
            }
        }

        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();

        additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(citation_count));

        // Sets iterate in arbitrary order; sort so the output is deterministic.
        let mut authors_list: Vec<String> = authors_set.into_iter().collect();
        authors_list.sort();
        additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));

        let mut years_sorted: Vec<u32> = years_set.into_iter().collect();
        years_sorted.sort_unstable();
        // After sorting, the first and last elements are the minimum and maximum.
        if let (Some(&min_year), Some(&max_year)) = (years_sorted.first(), years_sorted.last()) {
            additional.insert(
                Cow::Borrowed("year_range"),
                serde_json::json!({
                    "min": min_year,
                    "max": max_year,
                    "years": years_sorted
                }),
            );
        }

        if !dois_vec.is_empty() {
            additional.insert(Cow::Borrowed("dois"), serde_json::json!(dois_vec));
        }

        let mut keywords_list: Vec<String> = keywords_set.into_iter().collect();
        keywords_list.sort();
        if !keywords_list.is_empty() {
            additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords_list));
        }

        additional.insert(Cow::Borrowed("format"), serde_json::json!(format_string));

        Ok(build_result(formatted_content, additional))
    }

    /// MIME types routed to this extractor; must stay in sync with the
    /// `match` in `extract_bytes`.
    fn supported_mime_types(&self) -> &[&str] {
        &[
            "application/x-research-info-systems",
            "application/x-pubmed",
            "application/x-endnote+xml",
        ]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        60
    }
}
286
+
287
/// Tests for [`CitationExtractor`].
///
/// RIS and MEDLINE fixtures use the formats' conventional tag layout
/// (`TY  - JOUR`: tag padded to four columns, then `- `).
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    const RIS_MIME: &str = "application/x-research-info-systems";
    const PUBMED_MIME: &str = "application/x-pubmed";
    const ENDNOTE_MIME: &str = "application/x-endnote+xml";

    /// Run the extractor over `content` with a default configuration.
    async fn extract(content: &[u8], mime_type: &str) -> ExtractionResult {
        CitationExtractor::new()
            .extract_bytes(content, mime_type, &ExtractionConfig::default())
            .await
            .expect("citation extraction should not fail")
    }

    /// Look up one entry of `metadata.additional`.
    fn meta<'a>(result: &'a ExtractionResult, key: &str) -> Option<&'a serde_json::Value> {
        result.metadata.additional.get(key)
    }

    #[test]
    fn test_can_extract_citation_mime_types() {
        let extractor = CitationExtractor::new();
        let supported = extractor.supported_mime_types();

        assert_eq!(supported.len(), 3);
        for mime in &[RIS_MIME, PUBMED_MIME, ENDNOTE_MIME] {
            assert!(supported.contains(mime));
        }
    }

    #[tokio::test]
    async fn test_extract_simple_ris() {
        let ris_content = b"TY  - JOUR\n\
            TI  - Sample Title\n\
            AU  - Smith, John\n\
            PY  - 2023\n\
            ER  - \n";

        let result = extract(ris_content, RIS_MIME).await;

        assert!(result.content.contains("Sample Title"));
        assert!(result.content.contains("Smith"));
        assert_eq!(meta(&result, "citation_count"), Some(&serde_json::json!(1)));
        assert_eq!(meta(&result, "format"), Some(&serde_json::json!("RIS")));
    }

    #[tokio::test]
    async fn test_extract_multiple_ris_entries() {
        let ris_content = b"TY  - JOUR\n\
            TI  - First Paper\n\
            AU  - Author One\n\
            PY  - 2020\n\
            ER  - \n\
            \n\
            TY  - JOUR\n\
            TI  - Second Paper\n\
            AU  - Author Two\n\
            PY  - 2021\n\
            ER  - \n";

        let result = extract(ris_content, RIS_MIME).await;

        assert_eq!(meta(&result, "citation_count"), Some(&serde_json::json!(2)));

        // `year_range` is only inserted when years were parsed, so a missing
        // key must fail the test rather than skip the assertions.
        let year_range = meta(&result, "year_range").expect("year_range should be present when PY tags are parsed");
        assert_eq!(year_range.get("min"), Some(&serde_json::json!(2020)));
        assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
    }

    #[tokio::test]
    async fn test_extract_ris_with_doi() {
        let ris_content = b"TY  - JOUR\n\
            TI  - Sample Article\n\
            AU  - Smith, John\n\
            DO  - 10.1234/example.doi\n\
            PY  - 2023\n\
            ER  - \n";

        let result = extract(ris_content, RIS_MIME).await;

        let dois = meta(&result, "dois")
            .and_then(|value| value.as_array())
            .expect("dois should be a JSON array when a DO tag is parsed");
        assert!(!dois.is_empty());
    }

    #[tokio::test]
    async fn test_extract_empty_citation_file() {
        let result = extract(b"", RIS_MIME).await;

        assert_eq!(meta(&result, "citation_count"), Some(&serde_json::json!(0)));
    }

    #[tokio::test]
    async fn test_extract_malformed_ris() {
        let malformed_content = b"This is not valid RIS format\nJust some random text";

        let result = extract(malformed_content, RIS_MIME).await;

        // Whether the parser returns an error or an empty list, extraction
        // succeeds and no citations are counted.
        assert_eq!(meta(&result, "citation_count"), Some(&serde_json::json!(0)));
    }

    #[tokio::test]
    async fn test_extract_unknown_mime_type_returns_raw_content() {
        let raw = b"arbitrary text";

        let result = extract(raw, "text/x-not-a-citation").await;

        assert_eq!(result.content, "arbitrary text");
        assert_eq!(meta(&result, "citation_count"), Some(&serde_json::json!(0)));
        assert_eq!(meta(&result, "format"), Some(&serde_json::json!("Unknown")));
    }

    #[test]
    fn test_citation_extractor_plugin_interface() {
        let extractor = CitationExtractor::new();
        assert_eq!(extractor.name(), "citation-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 60);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_citation_extractor_default() {
        let extractor = CitationExtractor::default();
        assert_eq!(extractor.name(), "citation-extractor");
    }

    #[test]
    fn test_citation_extractor_initialize_shutdown() {
        let extractor = CitationExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    #[tokio::test]
    async fn test_extract_ris_with_keywords() {
        let ris_content = b"TY  - JOUR\n\
            TI  - Sample Article\n\
            AU  - Smith, John\n\
            KW  - keyword1\n\
            KW  - keyword2\n\
            KW  - keyword3\n\
            PY  - 2023\n\
            ER  - \n";

        let result = extract(ris_content, RIS_MIME).await;

        let keywords = meta(&result, "keywords")
            .and_then(|value| value.as_array())
            .expect("keywords should be a JSON array when KW tags are parsed");
        assert!(!keywords.is_empty());
    }

    #[tokio::test]
    async fn test_extract_ris_with_multiple_authors() {
        let ris_content = b"TY  - JOUR\n\
            TI  - Collaborative Work\n\
            AU  - First Author\n\
            AU  - Second Author\n\
            AU  - Third Author\n\
            PY  - 2023\n\
            ER  - \n";

        let result = extract(ris_content, RIS_MIME).await;

        let authors = meta(&result, "authors")
            .and_then(|value| value.as_array())
            .expect("authors is always inserted as a JSON array");
        assert!(!authors.is_empty());
    }

    #[tokio::test]
    async fn test_extract_pubmed_format() {
        let pubmed_content = b"PMID- 12345678\n\
            TI  - Sample PubMed Article\n\
            FAU - Smith, John\n\
            DP  - 2023\n";

        let result = extract(pubmed_content, PUBMED_MIME).await;

        assert_eq!(meta(&result, "format"), Some(&serde_json::json!("PubMed")));
    }

    #[tokio::test]
    async fn test_extract_endnote_xml_format() {
        let endnote_content = br#"<?xml version="1.0" encoding="UTF-8"?>
<xml>
<records>
<record>
<titles>
<title>Sample EndNote Article</title>
</titles>
<authors>
<author>Smith, John</author>
</authors>
</record>
</records>
</xml>"#;

        let result = extract(endnote_content, ENDNOTE_MIME).await;

        assert_eq!(meta(&result, "format"), Some(&serde_json::json!("EndNote XML")));
    }
}
@@ -1,6 +1,6 @@
1
1
  #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
2
 
3
- //! DOCX extractor using docx-lite for high-performance text extraction.
3
+ //! DOCX extractor for high-performance text extraction.
4
4
  //!
5
5
  //! Supports: Microsoft Word (.docx)
6
6
 
@@ -14,10 +14,10 @@ use async_trait::async_trait;
14
14
  use std::borrow::Cow;
15
15
  use std::io::Cursor;
16
16
 
17
- /// High-performance DOCX extractor using docx-lite.
17
+ /// High-performance DOCX extractor.
18
18
  ///
19
19
  /// This extractor provides:
20
- /// - Fast text extraction via streaming XML parsing (~160 MB/s average)
20
+ /// - Fast text extraction via streaming XML parsing
21
21
  /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
22
22
  pub struct DocxExtractor;
23
23
 
@@ -52,7 +52,7 @@ impl Plugin for DocxExtractor {
52
52
  }
53
53
 
54
54
  fn description(&self) -> &str {
55
- "High-performance DOCX text extraction using docx-lite with metadata support"
55
+ "High-performance DOCX text extraction with metadata support"
56
56
  }
57
57
 
58
58
  fn author(&self) -> &str {
@@ -60,15 +60,15 @@ impl Plugin for DocxExtractor {
60
60
  }
61
61
  }
62
62
 
63
- /// Convert docx-lite table to Kreuzberg Table struct with markdown representation.
63
+ /// Convert parsed DOCX table to Kreuzberg Table struct with markdown representation.
64
64
  ///
65
65
  /// # Arguments
66
- /// * `docx_table` - The table from docx-lite library
66
+ /// * `docx_table` - The parsed DOCX table
67
67
  /// * `table_index` - Index of the table in the document (used as page_number)
68
68
  ///
69
69
  /// # Returns
70
70
  /// * `Table` - Converted table with cells and markdown representation
71
- fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
71
+ fn convert_docx_table_to_table(docx_table: &crate::extraction::docx::parser::Table, table_index: usize) -> Table {
72
72
  let cells: Vec<Vec<String>> = docx_table
73
73
  .rows
74
74
  .iter()
@@ -97,14 +97,6 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
97
97
  }
98
98
  }
99
99
 
100
- /// Convert 2D cell data to markdown table format.
101
- ///
102
- /// # Arguments
103
- /// * `cells` - 2D vector of cell strings (rows × columns)
104
- ///
105
- /// # Returns
106
- /// * `String` - Markdown formatted table
107
-
108
100
  #[async_trait]
109
101
  impl DocumentExtractor for DocxExtractor {
110
102
  #[cfg_attr(feature = "otel", tracing::instrument(
@@ -126,9 +118,7 @@ impl DocumentExtractor for DocxExtractor {
126
118
  tokio::task::spawn_blocking(
127
119
  move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
128
120
  let _guard = span.entered();
129
- let cursor = Cursor::new(&content_owned);
130
- let doc = docx_lite::parse_document(cursor)
131
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
121
+ let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
132
122
 
133
123
  let text = doc.extract_text();
134
124
 
@@ -147,9 +137,7 @@ impl DocumentExtractor for DocxExtractor {
147
137
  .await
148
138
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
149
139
  } else {
150
- let cursor = Cursor::new(content);
151
- let doc = docx_lite::parse_document(cursor)
152
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
140
+ let doc = crate::extraction::docx::parser::parse_document(content)?;
153
141
 
154
142
  let text = doc.extract_text();
155
143
 
@@ -373,7 +361,7 @@ mod tests {
373
361
 
374
362
  #[test]
375
363
  fn test_convert_docx_table_to_table() {
376
- use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
364
+ use crate::extraction::docx::parser::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
377
365
 
378
366
  let mut table = DocxTable::new();
379
367
 
@@ -152,6 +152,7 @@ impl DocumentExtractor for ImageExtractor {
152
152
  chunks: None,
153
153
  images: None,
154
154
  djot_content: None,
155
+ elements: None,
155
156
  });
156
157
  }
157
158
  }
@@ -181,10 +182,23 @@ impl DocumentExtractor for ImageExtractor {
181
182
  "image/png",
182
183
  "image/jpeg",
183
184
  "image/jpg",
185
+ "image/pjpeg",
184
186
  "image/webp",
185
187
  "image/bmp",
188
+ "image/x-bmp",
189
+ "image/x-ms-bmp",
186
190
  "image/tiff",
191
+ "image/x-tiff",
187
192
  "image/gif",
193
+ "image/jp2",
194
+ "image/jpx",
195
+ "image/jpm",
196
+ "image/mj2",
197
+ "image/x-jbig2",
198
+ "image/x-portable-anymap",
199
+ "image/x-portable-bitmap",
200
+ "image/x-portable-graymap",
201
+ "image/x-portable-pixmap",
188
202
  ]
189
203
  }
190
204
 
@@ -223,4 +237,15 @@ mod tests {
223
237
  let extractor = ImageExtractor;
224
238
  assert_eq!(extractor.name(), "image-extractor");
225
239
  }
240
+
241
/// The image extractor must list the alias MIME names, not only the canonical ones.
#[test]
fn test_image_extractor_supports_alias_mime_types() {
    let extractor = ImageExtractor::new();
    let advertised = extractor.supported_mime_types();

    let aliases = [
        "image/pjpeg",
        "image/x-bmp",
        "image/x-ms-bmp",
        "image/x-tiff",
        "image/x-portable-anymap",
    ];
    for alias in aliases.iter() {
        assert!(advertised.contains(alias));
    }
}
226
251
  }