kreuzberg 4.2.12 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/lib/kreuzberg/version.rb +1 -1
  6. data/vendor/Cargo.toml +2 -2
  7. data/vendor/kreuzberg/Cargo.toml +24 -7
  8. data/vendor/kreuzberg/README.md +1 -1
  9. data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
  10. data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
  11. data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
  12. data/vendor/kreuzberg/src/core/mime.rs +47 -2
  13. data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
  14. data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
  15. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
  16. data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
  17. data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
  18. data/vendor/kreuzberg/src/extraction/image.rs +405 -18
  19. data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
  20. data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
  21. data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
  22. data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
  23. data/vendor/kreuzberg/src/extractors/image.rs +25 -0
  24. data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
  25. data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
  26. data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
  27. data/vendor/kreuzberg/src/extractors/security.rs +2 -1
  28. data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
  29. data/vendor/kreuzberg/src/extractors/text.rs +33 -4
  30. data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
  31. data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
  32. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  33. metadata +4 -2
@@ -0,0 +1,563 @@
1
+ //! Citation format extractors for RIS, PubMed/MEDLINE, and EndNote XML.
2
+ //!
3
+ //! Extracts and parses citation files in various formats, providing structured access
4
+ //! to bibliography entries, metadata, and author information.
5
+
6
+ use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use ahash::AHashMap;
11
+ use async_trait::async_trait;
12
+ use std::borrow::Cow;
13
+ use std::collections::HashSet;
14
+
15
+ #[cfg(feature = "office")]
16
+ use biblib::{CitationParser, EndNoteXmlParser, PubMedParser, RisParser};
17
+
/// Citation format extractor for RIS, PubMed/MEDLINE, and EndNote XML formats.
///
/// Parses citation files and extracts structured bibliography data including
/// entries, authors, publication years, and format-specific metadata.
///
/// The extractor is a stateless unit struct, so it is trivially copyable and its
/// [`Default`] value is identical to the value returned by [`CitationExtractor::new`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CitationExtractor;

impl CitationExtractor {
    /// Create a new citation extractor.
    ///
    /// Usable in `const` contexts because the type carries no state.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
36
+
37
+ impl Plugin for CitationExtractor {
38
+ fn name(&self) -> &str {
39
+ "citation-extractor"
40
+ }
41
+
42
+ fn version(&self) -> String {
43
+ env!("CARGO_PKG_VERSION").to_string()
44
+ }
45
+
46
+ fn initialize(&self) -> Result<()> {
47
+ Ok(())
48
+ }
49
+
50
+ fn shutdown(&self) -> Result<()> {
51
+ Ok(())
52
+ }
53
+
54
+ fn description(&self) -> &str {
55
+ "Extracts and parses citation files (RIS, PubMed/MEDLINE, EndNote XML) with structured metadata"
56
+ }
57
+
58
+ fn author(&self) -> &str {
59
+ "Kreuzberg Team"
60
+ }
61
+ }
62
+
/// Builds the display form of an author from the optional given name and the name.
///
/// Returns `"<given> <name>"` when a given name is present, otherwise `name` unchanged.
#[cfg(feature = "office")]
fn full_author_name(given_name: Option<&str>, name: &str) -> String {
    match given_name {
        Some(given) => format!("{} {}", given, name),
        None => name.to_string(),
    }
}

/// Wraps extracted text and citation metadata into an [`ExtractionResult`].
///
/// Only `content`, `mime_type` and `metadata.additional` are populated; every other
/// field is left at its empty value (`None` / empty vector / `Default`).
#[cfg(feature = "office")]
fn citation_result(
    content: String,
    mime_type: &str,
    additional: AHashMap<Cow<'static, str>, serde_json::Value>,
) -> ExtractionResult {
    ExtractionResult {
        content,
        mime_type: mime_type.to_string().into(),
        metadata: Metadata {
            additional,
            ..Default::default()
        },
        pages: None,
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        elements: None,
    }
}

#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for CitationExtractor {
    /// Parses a citation file and renders it as readable text plus aggregate metadata.
    ///
    /// The parser is selected from `mime_type` (RIS, PubMed, or EndNote XML). Input bytes
    /// are decoded as UTF-8 with lossy replacement of invalid sequences.
    ///
    /// `content` of the result holds one `Field: value` paragraph per citation, each
    /// terminated by a `---` line. `metadata.additional` holds:
    ///
    /// * `citation_count` - number of parsed citations
    /// * `authors` - sorted, de-duplicated, non-empty author names
    /// * `year_range` - `{ min, max, years }`, only when at least one positive year exists
    /// * `dois` - non-empty DOIs in input order, only when any exist
    /// * `keywords` - sorted, de-duplicated, non-empty keywords, only when any exist
    /// * `format` - `"RIS"`, `"PubMed"`, `"EndNote XML"`, or `"Unknown"`
    ///
    /// Nothing in this method returns `Err`: an unrecognised MIME type or a parser
    /// failure both fall back to returning the decoded input as `content` with a
    /// `citation_count` of `0`.
    #[cfg_attr(feature = "otel", tracing::instrument(
        skip(self, content, _config),
        fields(
            extractor.name = self.name(),
            content.size_bytes = content.len(),
        )
    ))]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let citation_str = String::from_utf8_lossy(content);

        // Parse based on MIME type
        let (parse_result, format_string) = match mime_type {
            "application/x-research-info-systems" => (RisParser::new().parse(&citation_str), "RIS"),
            "application/x-pubmed" => (PubMedParser::new().parse(&citation_str), "PubMed"),
            "application/x-endnote+xml" => (EndNoteXmlParser::new().parse(&citation_str), "EndNote XML"),
            _ => {
                // Fallback: return raw content if MIME type is unexpected
                let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
                additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(0));
                additional.insert(Cow::Borrowed("format"), serde_json::json!("Unknown"));

                return Ok(citation_result(citation_str.into_owned(), mime_type, additional));
            }
        };

        let mut citation_count = 0usize;
        let mut authors_set = HashSet::new();
        let mut years_set = HashSet::new();
        let mut dois_vec = Vec::new();
        let mut keywords_set = HashSet::new();
        let mut formatted_content = String::new();

        match parse_result {
            Ok(citations) => {
                // The count is all the metadata needs; no reason to copy every title.
                citation_count = citations.len();

                for citation in &citations {
                    // Format each author once and reuse it for both metadata and text.
                    let author_names: Vec<String> = citation
                        .authors
                        .iter()
                        .map(|author| full_author_name(author.given_name.as_deref(), &author.name))
                        .collect();

                    // Collect authors
                    authors_set.extend(author_names.iter().filter(|name| !name.is_empty()).cloned());

                    // Collect years (checked conversion; non-positive years are ignored)
                    if let Some(date) = &citation.date {
                        if let Some(year) = u32::try_from(date.year).ok().filter(|&year| year > 0) {
                            years_set.insert(year);
                        }
                    }

                    // Collect DOIs
                    if let Some(doi) = citation.doi.as_ref().filter(|doi| !doi.is_empty()) {
                        dois_vec.push(doi.clone());
                    }

                    // Collect keywords
                    keywords_set.extend(citation.keywords.iter().filter(|kw| !kw.is_empty()).cloned());

                    // Format citation as readable text
                    if !citation.title.is_empty() {
                        formatted_content.push_str(&format!("Title: {}\n", citation.title));
                    }

                    if !author_names.is_empty() {
                        formatted_content.push_str(&format!("Authors: {}\n", author_names.join(", ")));
                    }

                    if let Some(journal) = &citation.journal {
                        formatted_content.push_str(&format!("Journal: {}\n", journal));
                    }

                    if let Some(date) = &citation.date {
                        formatted_content.push_str(&format!("Year: {}\n", date.year));
                    }

                    // Volume, issue and pages share one line. Each part is emitted
                    // independently so that issue/pages are not lost when the volume
                    // is missing.
                    let mut locator: Vec<String> = Vec::with_capacity(3);
                    if let Some(volume) = &citation.volume {
                        locator.push(format!("Volume: {}", volume));
                    }
                    if let Some(issue) = &citation.issue {
                        locator.push(format!("Issue: {}", issue));
                    }
                    if let Some(pages) = &citation.pages {
                        locator.push(format!("Pages: {}", pages));
                    }
                    if !locator.is_empty() {
                        formatted_content.push_str(&locator.join(", "));
                        formatted_content.push('\n');
                    }

                    if let Some(doi) = &citation.doi {
                        formatted_content.push_str(&format!("DOI: {}\n", doi));
                    }

                    if let Some(pmid) = &citation.pmid {
                        formatted_content.push_str(&format!("PMID: {}\n", pmid));
                    }

                    if let Some(abstract_text) = &citation.abstract_text {
                        if !abstract_text.is_empty() {
                            formatted_content.push_str(&format!("Abstract: {}\n", abstract_text));
                        }
                    }

                    if !citation.keywords.is_empty() {
                        formatted_content.push_str(&format!("Keywords: {}\n", citation.keywords.join(", ")));
                    }

                    formatted_content.push_str("---\n");
                }
            }
            Err(_err) => {
                // Best effort: a file the parser rejects is still returned as plain text.
                #[cfg(feature = "otel")]
                tracing::warn!("Citation parsing failed, returning raw content: {}", _err);
                formatted_content = citation_str.to_string();
            }
        }

        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();

        additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(citation_count));

        // Hash-set iteration order is unspecified; sort for stable output.
        let mut authors_list: Vec<String> = authors_set.into_iter().collect();
        authors_list.sort();
        additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));

        let mut years_sorted: Vec<u32> = years_set.into_iter().collect();
        years_sorted.sort_unstable();
        // After sorting, the first and last elements are the minimum and maximum.
        if let (Some(&min_year), Some(&max_year)) = (years_sorted.first(), years_sorted.last()) {
            additional.insert(
                Cow::Borrowed("year_range"),
                serde_json::json!({
                    "min": min_year,
                    "max": max_year,
                    "years": years_sorted
                }),
            );
        }

        if !dois_vec.is_empty() {
            additional.insert(Cow::Borrowed("dois"), serde_json::json!(dois_vec));
        }

        let mut keywords_list: Vec<String> = keywords_set.into_iter().collect();
        keywords_list.sort();
        if !keywords_list.is_empty() {
            additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords_list));
        }

        additional.insert(Cow::Borrowed("format"), serde_json::json!(format_string));

        Ok(citation_result(formatted_content, mime_type, additional))
    }

    /// MIME types handled by this extractor: RIS, PubMed, and EndNote XML.
    fn supported_mime_types(&self) -> &[&str] {
        &[
            "application/x-research-info-systems",
            "application/x-pubmed",
            "application/x-endnote+xml",
        ]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        60
    }
}
286
+
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    const RIS_MIME: &str = "application/x-research-info-systems";
    const PUBMED_MIME: &str = "application/x-pubmed";
    const ENDNOTE_MIME: &str = "application/x-endnote+xml";

    /// Runs the extractor on `content` and unwraps the result, panicking with `context`
    /// if extraction returns an error.
    async fn extract(content: &[u8], mime_type: &str, context: &str) -> ExtractionResult {
        let extractor = CitationExtractor::new();
        let config = ExtractionConfig::default();
        extractor
            .extract_bytes(content, mime_type, &config)
            .await
            .expect(context)
    }

    /// Looks up one entry of `metadata.additional`.
    fn additional<'a>(result: &'a ExtractionResult, key: &str) -> Option<&'a serde_json::Value> {
        result.metadata.additional.get(key)
    }

    // NOTE: RIS and MEDLINE fixtures use the canonical tag layout
    // (two-letter tag, two spaces, hyphen, space).

    #[test]
    fn test_can_extract_citation_mime_types() {
        let extractor = CitationExtractor::new();
        let supported = extractor.supported_mime_types();

        assert!(supported.contains(&RIS_MIME));
        assert!(supported.contains(&PUBMED_MIME));
        assert!(supported.contains(&ENDNOTE_MIME));
        assert_eq!(supported.len(), 3);
    }

    #[tokio::test]
    async fn test_extract_simple_ris() {
        let ris_content = br#"TY  - JOUR
TI  - Sample Title
AU  - Smith, John
PY  - 2023
ER  -"#;

        let result = extract(ris_content, RIS_MIME, "Should extract valid RIS entry").await;

        assert!(result.content.contains("Sample Title"));
        assert!(result.content.contains("Smith"));

        assert_eq!(additional(&result, "citation_count"), Some(&serde_json::json!(1)));
        assert_eq!(additional(&result, "format"), Some(&serde_json::json!("RIS")));
    }

    #[tokio::test]
    async fn test_extract_multiple_ris_entries() {
        let ris_content = br#"TY  - JOUR
TI  - First Paper
AU  - Author One
PY  - 2020
ER  -

TY  - JOUR
TI  - Second Paper
AU  - Author Two
PY  - 2021
ER  -"#;

        let result = extract(ris_content, RIS_MIME, "Should extract multiple RIS entries").await;

        assert_eq!(additional(&result, "citation_count"), Some(&serde_json::json!(2)));

        // Fail loudly if the key is missing instead of silently skipping the checks.
        let year_range = additional(&result, "year_range").expect("year_range should be present");
        assert_eq!(year_range.get("min"), Some(&serde_json::json!(2020)));
        assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
    }

    #[tokio::test]
    async fn test_extract_ris_with_doi() {
        let ris_content = br#"TY  - JOUR
TI  - Sample Article
AU  - Smith, John
DO  - 10.1234/example.doi
PY  - 2023
ER  -"#;

        let result = extract(ris_content, RIS_MIME, "Should extract RIS with DOI").await;

        let dois = additional(&result, "dois")
            .and_then(serde_json::Value::as_array)
            .expect("dois should be present as an array");
        assert!(!dois.is_empty());
    }

    #[tokio::test]
    async fn test_extract_empty_citation_file() {
        let result = extract(b"", RIS_MIME, "Should handle empty citation file").await;

        assert_eq!(additional(&result, "citation_count"), Some(&serde_json::json!(0)));
    }

    #[tokio::test]
    async fn test_extract_malformed_ris() {
        let malformed_content = b"This is not valid RIS format\nJust some random text";

        let result = extract(malformed_content, RIS_MIME, "Should extract malformed as raw content").await;

        // Whether the parser reports an error or simply yields no entries, the
        // extractor must succeed and report zero citations.
        assert_eq!(additional(&result, "citation_count"), Some(&serde_json::json!(0)));
    }

    #[test]
    fn test_citation_extractor_plugin_interface() {
        let extractor = CitationExtractor::new();
        assert_eq!(extractor.name(), "citation-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 60);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_citation_extractor_default() {
        // Exercise the `Default` impl itself rather than the unit-struct literal.
        let extractor = CitationExtractor::default();
        assert_eq!(extractor.name(), "citation-extractor");
    }

    #[test]
    fn test_citation_extractor_initialize_shutdown() {
        let extractor = CitationExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    #[tokio::test]
    async fn test_extract_ris_with_keywords() {
        let ris_content = br#"TY  - JOUR
TI  - Sample Article
AU  - Smith, John
KW  - keyword1
KW  - keyword2
KW  - keyword3
PY  - 2023
ER  -"#;

        let result = extract(ris_content, RIS_MIME, "Should extract RIS with keywords").await;

        let keywords = additional(&result, "keywords")
            .and_then(serde_json::Value::as_array)
            .expect("keywords should be present as an array");
        assert!(!keywords.is_empty());
    }

    #[tokio::test]
    async fn test_extract_ris_with_multiple_authors() {
        let ris_content = br#"TY  - JOUR
TI  - Collaborative Work
AU  - First Author
AU  - Second Author
AU  - Third Author
PY  - 2023
ER  -"#;

        let result = extract(ris_content, RIS_MIME, "Should extract multiple authors").await;

        let authors = additional(&result, "authors")
            .and_then(serde_json::Value::as_array)
            .expect("authors should be present as an array");
        assert!(!authors.is_empty());
    }

    #[tokio::test]
    async fn test_extract_pubmed_format() {
        let pubmed_content = br#"PMID- 12345678
TI  - Sample PubMed Article
FAU - Smith, John
DP  - 2023"#;

        let result = extract(pubmed_content, PUBMED_MIME, "Should extract PubMed format").await;

        assert_eq!(additional(&result, "format"), Some(&serde_json::json!("PubMed")));
    }

    #[tokio::test]
    async fn test_extract_endnote_xml_format() {
        let endnote_content = br#"<?xml version="1.0" encoding="UTF-8"?>
<xml>
<records>
<record>
<titles>
<title>Sample EndNote Article</title>
</titles>
<authors>
<author>Smith, John</author>
</authors>
</record>
</records>
</xml>"#;

        let result = extract(endnote_content, ENDNOTE_MIME, "Should extract EndNote XML format").await;

        assert_eq!(additional(&result, "format"), Some(&serde_json::json!("EndNote XML")));
    }
}
@@ -152,6 +152,7 @@ impl DocumentExtractor for ImageExtractor {
152
152
  chunks: None,
153
153
  images: None,
154
154
  djot_content: None,
155
+ elements: None,
155
156
  });
156
157
  }
157
158
  }
@@ -181,10 +182,23 @@ impl DocumentExtractor for ImageExtractor {
181
182
  "image/png",
182
183
  "image/jpeg",
183
184
  "image/jpg",
185
+ "image/pjpeg",
184
186
  "image/webp",
185
187
  "image/bmp",
188
+ "image/x-bmp",
189
+ "image/x-ms-bmp",
186
190
  "image/tiff",
191
+ "image/x-tiff",
187
192
  "image/gif",
193
+ "image/jp2",
194
+ "image/jpx",
195
+ "image/jpm",
196
+ "image/mj2",
197
+ "image/x-jbig2",
198
+ "image/x-portable-anymap",
199
+ "image/x-portable-bitmap",
200
+ "image/x-portable-graymap",
201
+ "image/x-portable-pixmap",
188
202
  ]
189
203
  }
190
204
 
@@ -223,4 +237,15 @@ mod tests {
223
237
  let extractor = ImageExtractor;
224
238
  assert_eq!(extractor.name(), "image-extractor");
225
239
  }
240
+
/// Checks that the alias spellings of the JPEG, BMP, TIFF and PNM MIME types are
/// listed among the extractor's supported types.
#[test]
fn test_image_extractor_supports_alias_mime_types() {
    let extractor = ImageExtractor::new();
    let advertised = extractor.supported_mime_types();

    let expected_aliases = [
        "image/pjpeg",
        "image/x-bmp",
        "image/x-ms-bmp",
        "image/x-tiff",
        "image/x-portable-anymap",
    ];
    for alias in &expected_aliases {
        assert!(advertised.contains(alias), "missing MIME alias: {}", alias);
    }
}
226
251
  }
@@ -238,7 +238,14 @@ impl DocumentExtractor for MarkdownExtractor {
238
238
  }
239
239
 
240
240
  fn supported_mime_types(&self) -> &[&str] {
241
- &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"]
241
+ &[
242
+ "text/markdown",
243
+ "text/x-markdown",
244
+ "text/x-gfm",
245
+ "text/x-commonmark",
246
+ "text/x-markdown-extra",
247
+ "text/x-multimarkdown",
248
+ ]
242
249
  }
243
250
 
244
251
  fn priority(&self) -> i32 {
@@ -261,6 +268,8 @@ mod tests {
261
268
  assert!(mime_types.contains(&"text/x-markdown"));
262
269
  assert!(mime_types.contains(&"text/x-gfm"));
263
270
  assert!(mime_types.contains(&"text/x-commonmark"));
271
+ assert!(mime_types.contains(&"text/x-markdown-extra"));
272
+ assert!(mime_types.contains(&"text/x-multimarkdown"));
264
273
  }
265
274
 
266
275
  #[test]