RubyGems - kreuzberg - Versions diffs - 4.2.11 → 4.2.13 - Mend

kreuzberg 4.2.11 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +2 -2
data/vendor/kreuzberg/Cargo.toml +24 -9
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
data/vendor/kreuzberg/src/core/mime.rs +47 -2
data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
data/vendor/kreuzberg/src/extraction/{docx.rs → docx/mod.rs} +7 -17
data/vendor/kreuzberg/src/extraction/docx/parser.rs +686 -0
data/vendor/kreuzberg/src/extraction/image.rs +405 -18
data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
data/vendor/kreuzberg/src/extractors/docx.rs +10 -22
data/vendor/kreuzberg/src/extractors/image.rs +25 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
data/vendor/kreuzberg/src/extractors/security.rs +2 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
data/vendor/kreuzberg/src/extractors/text.rs +33 -4
data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
data/vendor/kreuzberg/tests/issue_359_list_whitespace_test.rs +33 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
metadata +7 -3

data/vendor/kreuzberg/src/extractors/citation.rs ADDED Viewed

@@ -0,0 +1,563 @@
+//! Citation format extractors for RIS, PubMed/MEDLINE, and EndNote XML.
+//!
+//! Extracts and parses citation files in various formats, providing structured access
+//! to bibliography entries, metadata, and author information.
+use crate::Result;
+use crate::core::config::ExtractionConfig;
+use crate::plugins::{DocumentExtractor, Plugin};
+use crate::types::{ExtractionResult, Metadata};
+use ahash::AHashMap;
+use async_trait::async_trait;
+use std::borrow::Cow;
+use std::collections::HashSet;
+#[cfg(feature = "office")]
+use biblib::{CitationParser, EndNoteXmlParser, PubMedParser, RisParser};
+/// Citation format extractor for RIS, PubMed/MEDLINE, and EndNote XML formats.
+///
+/// Parses citation files and extracts structured bibliography data including
+/// entries, authors, publication years, and format-specific metadata.
+pub struct CitationExtractor;
+impl CitationExtractor {
+    /// Create a new citation extractor.
+    pub fn new() -> Self {
+        Self
+    }
+}
+impl Default for CitationExtractor {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+impl Plugin for CitationExtractor {
+    fn name(&self) -> &str {
+        "citation-extractor"
+    }
+    fn version(&self) -> String {
+        env!("CARGO_PKG_VERSION").to_string()
+    }
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+    fn description(&self) -> &str {
+        "Extracts and parses citation files (RIS, PubMed/MEDLINE, EndNote XML) with structured metadata"
+    }
+    fn author(&self) -> &str {
+        "Kreuzberg Team"
+    }
+}
+#[cfg(feature = "office")]
+#[async_trait]
+impl DocumentExtractor for CitationExtractor {
+    #[cfg_attr(feature = "otel", tracing::instrument(
+        skip(self, content, _config),
+        fields(
+            extractor.name = self.name(),
+            content.size_bytes = content.len(),
+        )
+    ))]
+    async fn extract_bytes(
+        &self,
+        content: &[u8],
+        mime_type: &str,
+        _config: &ExtractionConfig,
+    ) -> Result<ExtractionResult> {
+        let citation_str = String::from_utf8_lossy(content);
+        let mut citations_vec = Vec::new();
+        let mut authors_set = HashSet::new();
+        let mut years_set = HashSet::new();
+        let mut dois_vec = Vec::new();
+        let mut keywords_set = HashSet::new();
+        let mut formatted_content = String::new();
+        // Parse based on MIME type
+        let (parse_result, format_string) = match mime_type {
+            "application/x-research-info-systems" => (RisParser::new().parse(&citation_str), "RIS"),
+            "application/x-pubmed" => (PubMedParser::new().parse(&citation_str), "PubMed"),
+            "application/x-endnote+xml" => (EndNoteXmlParser::new().parse(&citation_str), "EndNote XML"),
+            _ => {
+                // Fallback: return raw content if MIME type is unexpected
+                let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+                additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(0));
+                additional.insert(Cow::Borrowed("format"), serde_json::json!("Unknown"));
+                return Ok(ExtractionResult {
+                    content: citation_str.to_string(),
+                    mime_type: mime_type.to_string().into(),
+                    metadata: Metadata {
+                        additional,
+                        ..Default::default()
+                    },
+                    pages: None,
+                    tables: vec![],
+                    detected_languages: None,
+                    chunks: None,
+                    images: None,
+                    djot_content: None,
+                    elements: None,
+                });
+            }
+        };
+        match parse_result {
+            Ok(citations) => {
+                for citation in &citations {
+                    citations_vec.push(citation.title.clone());
+                    // Collect authors
+                    for author in &citation.authors {
+                        let author_name = if let Some(given) = &author.given_name {
+                            format!("{} {}", given, author.name)
+                        } else {
+                            author.name.clone()
+                        };
+                        if !author_name.is_empty() {
+                            authors_set.insert(author_name);
+                        }
+                    }
+                    // Collect years
+                    if let Some(date) = &citation.date {
+                        if date.year > 0 {
+                            years_set.insert(date.year as u32);
+                        }
+                    }
+                    // Collect DOIs
+                    if let Some(doi) = &citation.doi {
+                        if !doi.is_empty() {
+                            dois_vec.push(doi.clone());
+                        }
+                    }
+                    // Collect keywords
+                    for keyword in &citation.keywords {
+                        if !keyword.is_empty() {
+                            keywords_set.insert(keyword.clone());
+                        }
+                    }
+                    // Format citation as readable text
+                    if !citation.title.is_empty() {
+                        formatted_content.push_str(&format!("Title: {}\n", citation.title));
+                    }
+                    if !citation.authors.is_empty() {
+                        let author_strings: Vec<String> = citation
+                            .authors
+                            .iter()
+                            .map(|a| {
+                                if let Some(given) = &a.given_name {
+                                    format!("{} {}", given, a.name)
+                                } else {
+                                    a.name.clone()
+                                }
+                            })
+                            .collect();
+                        formatted_content.push_str(&format!("Authors: {}\n", author_strings.join(", ")));
+                    }
+                    if let Some(journal) = &citation.journal {
+                        formatted_content.push_str(&format!("Journal: {}\n", journal));
+                    }
+                    if let Some(date) = &citation.date {
+                        formatted_content.push_str(&format!("Year: {}\n", date.year));
+                    }
+                    if let Some(volume) = &citation.volume {
+                        formatted_content.push_str(&format!("Volume: {}", volume));
+                        if let Some(issue) = &citation.issue {
+                            formatted_content.push_str(&format!(", Issue: {}", issue));
+                        }
+                        if let Some(pages) = &citation.pages {
+                            formatted_content.push_str(&format!(", Pages: {}", pages));
+                        }
+                        formatted_content.push('\n');
+                    }
+                    if let Some(doi) = &citation.doi {
+                        formatted_content.push_str(&format!("DOI: {}\n", doi));
+                    }
+                    if let Some(pmid) = &citation.pmid {
+                        formatted_content.push_str(&format!("PMID: {}\n", pmid));
+                    }
+                    if let Some(abstract_text) = &citation.abstract_text {
+                        if !abstract_text.is_empty() {
+                            formatted_content.push_str(&format!("Abstract: {}\n", abstract_text));
+                        }
+                    }
+                    if !citation.keywords.is_empty() {
+                        formatted_content.push_str(&format!("Keywords: {}\n", citation.keywords.join(", ")));
+                    }
+                    formatted_content.push_str("---\n");
+                }
+            }
+            Err(_err) => {
+                #[cfg(feature = "otel")]
+                tracing::warn!("Citation parsing failed, returning raw content: {}", _err);
+                formatted_content = citation_str.to_string();
+            }
+        }
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+        additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(citations_vec.len()));
+        let mut authors_list: Vec<String> = authors_set.into_iter().collect();
+        authors_list.sort();
+        additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));
+        if !years_set.is_empty() {
+            let min_year = years_set.iter().min().copied().unwrap_or(0);
+            let max_year = years_set.iter().max().copied().unwrap_or(0);
+            let mut years_sorted: Vec<u32> = years_set.into_iter().collect();
+            years_sorted.sort_unstable();
+            additional.insert(
+                Cow::Borrowed("year_range"),
+                serde_json::json!({
+                    "min": min_year,
+                    "max": max_year,
+                    "years": years_sorted
+                }),
+            );
+        }
+        if !dois_vec.is_empty() {
+            additional.insert(Cow::Borrowed("dois"), serde_json::json!(dois_vec));
+        }
+        let mut keywords_list: Vec<String> = keywords_set.into_iter().collect();
+        keywords_list.sort();
+        if !keywords_list.is_empty() {
+            additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords_list));
+        }
+        additional.insert(Cow::Borrowed("format"), serde_json::json!(format_string));
+        Ok(ExtractionResult {
+            content: formatted_content,
+            mime_type: mime_type.to_string().into(),
+            metadata: Metadata {
+                additional,
+                ..Default::default()
+            },
+            pages: None,
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            djot_content: None,
+            elements: None,
+        })
+    }
+    fn supported_mime_types(&self) -> &[&str] {
+        &[
+            "application/x-research-info-systems",
+            "application/x-pubmed",
+            "application/x-endnote+xml",
+        ]
+    }
+    fn priority(&self) -> i32 {
+        60
+    }
+}
+#[cfg(all(test, feature = "office"))]
+mod tests {
+    use super::*;
+    #[tokio::test]
+    async fn test_can_extract_citation_mime_types() {
+        let extractor = CitationExtractor::new();
+        let supported = extractor.supported_mime_types();
+        assert!(supported.contains(&"application/x-research-info-systems"));
+        assert!(supported.contains(&"application/x-pubmed"));
+        assert!(supported.contains(&"application/x-endnote+xml"));
+        assert_eq!(supported.len(), 3);
+    }
+    #[tokio::test]
+    async fn test_extract_simple_ris() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Title
+AU  - Smith, John
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract valid RIS entry");
+        assert!(result.content.contains("Sample Title"));
+        assert!(result.content.contains("Smith"));
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(1))
+        );
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("RIS"))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_multiple_ris_entries() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - First Paper
+AU  - Author One
+PY  - 2020
+ER  -
+TY  - JOUR
+TI  - Second Paper
+AU  - Author Two
+PY  - 2021
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract multiple RIS entries");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(2))
+        );
+        if let Some(year_range) = metadata.additional.get("year_range") {
+            assert_eq!(year_range.get("min"), Some(&serde_json::json!(2020)));
+            assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_doi() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Article
+AU  - Smith, John
+DO  - 10.1234/example.doi
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract RIS with DOI");
+        let metadata = &result.metadata;
+        if let Some(dois) = metadata.additional.get("dois") {
+            assert!(!dois.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_empty_citation_file() {
+        let extractor = CitationExtractor::new();
+        let empty_content = b"";
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(empty_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should handle empty citation file");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(0))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_malformed_ris() {
+        let extractor = CitationExtractor::new();
+        let malformed_content = b"This is not valid RIS format\nJust some random text";
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(malformed_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract malformed as raw content");
+        // When RIS parser encounters unparseable content, it may return empty results
+        // Verify we get a result either way
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(0))
+        );
+    }
+    #[tokio::test]
+    async fn test_citation_extractor_plugin_interface() {
+        let extractor = CitationExtractor::new();
+        assert_eq!(extractor.name(), "citation-extractor");
+        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
+        assert_eq!(extractor.priority(), 60);
+        assert!(!extractor.supported_mime_types().is_empty());
+    }
+    #[test]
+    fn test_citation_extractor_default() {
+        let extractor = CitationExtractor;
+        assert_eq!(extractor.name(), "citation-extractor");
+    }
+    #[tokio::test]
+    async fn test_citation_extractor_initialize_shutdown() {
+        let extractor = CitationExtractor::new();
+        assert!(extractor.initialize().is_ok());
+        assert!(extractor.shutdown().is_ok());
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_keywords() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Article
+AU  - Smith, John
+KW  - keyword1
+KW  - keyword2
+KW  - keyword3
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract RIS with keywords");
+        let metadata = &result.metadata;
+        if let Some(keywords) = metadata.additional.get("keywords") {
+            assert!(!keywords.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_multiple_authors() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Collaborative Work
+AU  - First Author
+AU  - Second Author
+AU  - Third Author
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract multiple authors");
+        let metadata = &result.metadata;
+        if let Some(authors) = metadata.additional.get("authors") {
+            assert!(!authors.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_pubmed_format() {
+        let extractor = CitationExtractor::new();
+        let pubmed_content = br#"PMID- 12345678
+TI  - Sample PubMed Article
+FAU - Smith, John
+DP  - 2023"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(pubmed_content, "application/x-pubmed", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract PubMed format");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("PubMed"))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_endnote_xml_format() {
+        let extractor = CitationExtractor::new();
+        let endnote_content = br#"<?xml version="1.0" encoding="UTF-8"?>
+<xml>
+  <records>
+    <record>
+      <titles>
+        <title>Sample EndNote Article</title>
+      </titles>
+      <authors>
+        <author>Smith, John</author>
+      </authors>
+    </record>
+  </records>
+</xml>"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(endnote_content, "application/x-endnote+xml", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract EndNote XML format");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("EndNote XML"))
+        );
+    }
+}

data/vendor/kreuzberg/src/extractors/docx.rs CHANGED Viewed

@@ -1,6 +1,6 @@
 #![cfg(all(feature = "tokio-runtime", feature = "office"))]
-//! DOCX extractor using docx-lite for high-performance text extraction.
+//! DOCX extractor for high-performance text extraction.
 //!
 //! Supports: Microsoft Word (.docx)
@@ -14,10 +14,10 @@ use async_trait::async_trait;
 use std::borrow::Cow;
 use std::io::Cursor;
-/// High-performance DOCX extractor using docx-lite.
+/// High-performance DOCX extractor.
 ///
 /// This extractor provides:
-/// - Fast text extraction via streaming XML parsing (~160 MB/s average)
+/// - Fast text extraction via streaming XML parsing
 /// - Comprehensive metadata extraction (core.xml, app.xml, custom.xml)
 pub struct DocxExtractor;
@@ -52,7 +52,7 @@ impl Plugin for DocxExtractor {
     }
     fn description(&self) -> &str {
-        "High-performance DOCX text extraction using docx-lite with metadata support"
+        "High-performance DOCX text extraction with metadata support"
     }
     fn author(&self) -> &str {
@@ -60,15 +60,15 @@ impl Plugin for DocxExtractor {
     }
 }
-/// Convert docx-lite table to Kreuzberg Table struct with markdown representation.
+/// Convert parsed DOCX table to Kreuzberg Table struct with markdown representation.
 ///
 /// # Arguments
-/// * `docx_table` - The table from docx-lite library
+/// * `docx_table` - The parsed DOCX table
 /// * `table_index` - Index of the table in the document (used as page_number)
 ///
 /// # Returns
 /// * `Table` - Converted table with cells and markdown representation
-fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize) -> Table {
+fn convert_docx_table_to_table(docx_table: &crate::extraction::docx::parser::Table, table_index: usize) -> Table {
     let cells: Vec<Vec<String>> = docx_table
         .rows
         .iter()
@@ -97,14 +97,6 @@ fn convert_docx_table_to_table(docx_table: &docx_lite::Table, table_index: usize
     }
 }
-/// Convert 2D cell data to markdown table format.
-///
-/// # Arguments
-/// * `cells` - 2D vector of cell strings (rows × columns)
-///
-/// # Returns
-/// * `String` - Markdown formatted table
 #[async_trait]
 impl DocumentExtractor for DocxExtractor {
     #[cfg_attr(feature = "otel", tracing::instrument(
@@ -126,9 +118,7 @@ impl DocumentExtractor for DocxExtractor {
             tokio::task::spawn_blocking(
                 move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
                     let _guard = span.entered();
-                    let cursor = Cursor::new(&content_owned);
-                    let doc = docx_lite::parse_document(cursor)
-                        .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
+                    let doc = crate::extraction::docx::parser::parse_document(&content_owned)?;
                     let text = doc.extract_text();
@@ -147,9 +137,7 @@ impl DocumentExtractor for DocxExtractor {
             .await
             .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
         } else {
-            let cursor = Cursor::new(content);
-            let doc = docx_lite::parse_document(cursor)
-                .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
+            let doc = crate::extraction::docx::parser::parse_document(content)?;
             let text = doc.extract_text();
@@ -373,7 +361,7 @@ mod tests {
     #[test]
     fn test_convert_docx_table_to_table() {
-        use docx_lite::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
+        use crate::extraction::docx::parser::{Paragraph, Run, Table as DocxTable, TableCell, TableRow};
         let mut table = DocxTable::new();

data/vendor/kreuzberg/src/extractors/image.rs CHANGED Viewed

@@ -152,6 +152,7 @@ impl DocumentExtractor for ImageExtractor {
                     chunks: None,
                     images: None,
                     djot_content: None,
+                    elements: None,
                 });
             }
         }
@@ -181,10 +182,23 @@ impl DocumentExtractor for ImageExtractor {
             "image/png",
             "image/jpeg",
             "image/jpg",
+            "image/pjpeg",
             "image/webp",
             "image/bmp",
+            "image/x-bmp",
+            "image/x-ms-bmp",
             "image/tiff",
+            "image/x-tiff",
             "image/gif",
+            "image/jp2",
+            "image/jpx",
+            "image/jpm",
+            "image/mj2",
+            "image/x-jbig2",
+            "image/x-portable-anymap",
+            "image/x-portable-bitmap",
+            "image/x-portable-graymap",
+            "image/x-portable-pixmap",
         ]
     }
@@ -223,4 +237,15 @@ mod tests {
         let extractor = ImageExtractor;
         assert_eq!(extractor.name(), "image-extractor");
     }
+    #[test]
+    fn test_image_extractor_supports_alias_mime_types() {
+        let extractor = ImageExtractor::new();
+        let supported = extractor.supported_mime_types();
+        assert!(supported.contains(&"image/pjpeg"));
+        assert!(supported.contains(&"image/x-bmp"));
+        assert!(supported.contains(&"image/x-ms-bmp"));
+        assert!(supported.contains(&"image/x-tiff"));
+        assert!(supported.contains(&"image/x-portable-anymap"));
+    }
 }