RubyGems - kreuzberg - Versions diffs - 4.2.12 → 4.2.13 - Mend

kreuzberg 4.2.12 → 4.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +2 -2
data/vendor/kreuzberg/Cargo.toml +24 -7
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
data/vendor/kreuzberg/src/core/mime.rs +47 -2
data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
data/vendor/kreuzberg/src/extraction/image.rs +405 -18
data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
data/vendor/kreuzberg/src/extractors/image.rs +25 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
data/vendor/kreuzberg/src/extractors/security.rs +2 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
data/vendor/kreuzberg/src/extractors/text.rs +33 -4
data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +4 -2

data/vendor/kreuzberg/src/extractors/citation.rs ADDED Viewed

@@ -0,0 +1,563 @@
+//! Citation format extractors for RIS, PubMed/MEDLINE, and EndNote XML.
+//!
+//! Extracts and parses citation files in various formats, providing structured access
+//! to bibliography entries, metadata, and author information.
+use crate::Result;
+use crate::core::config::ExtractionConfig;
+use crate::plugins::{DocumentExtractor, Plugin};
+use crate::types::{ExtractionResult, Metadata};
+use ahash::AHashMap;
+use async_trait::async_trait;
+use std::borrow::Cow;
+use std::collections::HashSet;
+#[cfg(feature = "office")]
+use biblib::{CitationParser, EndNoteXmlParser, PubMedParser, RisParser};
+/// Citation format extractor for RIS, PubMed/MEDLINE, and EndNote XML formats.
+///
+/// Parses citation files and extracts structured bibliography data including
+/// entries, authors, publication years, and format-specific metadata.
+pub struct CitationExtractor;
+impl CitationExtractor {
+    /// Create a new citation extractor.
+    pub fn new() -> Self {
+        Self
+    }
+}
+impl Default for CitationExtractor {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+impl Plugin for CitationExtractor {
+    fn name(&self) -> &str {
+        "citation-extractor"
+    }
+    fn version(&self) -> String {
+        env!("CARGO_PKG_VERSION").to_string()
+    }
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+    fn description(&self) -> &str {
+        "Extracts and parses citation files (RIS, PubMed/MEDLINE, EndNote XML) with structured metadata"
+    }
+    fn author(&self) -> &str {
+        "Kreuzberg Team"
+    }
+}
+#[cfg(feature = "office")]
+#[async_trait]
+impl DocumentExtractor for CitationExtractor {
+    #[cfg_attr(feature = "otel", tracing::instrument(
+        skip(self, content, _config),
+        fields(
+            extractor.name = self.name(),
+            content.size_bytes = content.len(),
+        )
+    ))]
+    async fn extract_bytes(
+        &self,
+        content: &[u8],
+        mime_type: &str,
+        _config: &ExtractionConfig,
+    ) -> Result<ExtractionResult> {
+        let citation_str = String::from_utf8_lossy(content);
+        let mut citations_vec = Vec::new();
+        let mut authors_set = HashSet::new();
+        let mut years_set = HashSet::new();
+        let mut dois_vec = Vec::new();
+        let mut keywords_set = HashSet::new();
+        let mut formatted_content = String::new();
+        // Parse based on MIME type
+        let (parse_result, format_string) = match mime_type {
+            "application/x-research-info-systems" => (RisParser::new().parse(&citation_str), "RIS"),
+            "application/x-pubmed" => (PubMedParser::new().parse(&citation_str), "PubMed"),
+            "application/x-endnote+xml" => (EndNoteXmlParser::new().parse(&citation_str), "EndNote XML"),
+            _ => {
+                // Fallback: return raw content if MIME type is unexpected
+                let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+                additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(0));
+                additional.insert(Cow::Borrowed("format"), serde_json::json!("Unknown"));
+                return Ok(ExtractionResult {
+                    content: citation_str.to_string(),
+                    mime_type: mime_type.to_string().into(),
+                    metadata: Metadata {
+                        additional,
+                        ..Default::default()
+                    },
+                    pages: None,
+                    tables: vec![],
+                    detected_languages: None,
+                    chunks: None,
+                    images: None,
+                    djot_content: None,
+                    elements: None,
+                });
+            }
+        };
+        match parse_result {
+            Ok(citations) => {
+                for citation in &citations {
+                    citations_vec.push(citation.title.clone());
+                    // Collect authors
+                    for author in &citation.authors {
+                        let author_name = if let Some(given) = &author.given_name {
+                            format!("{} {}", given, author.name)
+                        } else {
+                            author.name.clone()
+                        };
+                        if !author_name.is_empty() {
+                            authors_set.insert(author_name);
+                        }
+                    }
+                    // Collect years
+                    if let Some(date) = &citation.date {
+                        if date.year > 0 {
+                            years_set.insert(date.year as u32);
+                        }
+                    }
+                    // Collect DOIs
+                    if let Some(doi) = &citation.doi {
+                        if !doi.is_empty() {
+                            dois_vec.push(doi.clone());
+                        }
+                    }
+                    // Collect keywords
+                    for keyword in &citation.keywords {
+                        if !keyword.is_empty() {
+                            keywords_set.insert(keyword.clone());
+                        }
+                    }
+                    // Format citation as readable text
+                    if !citation.title.is_empty() {
+                        formatted_content.push_str(&format!("Title: {}\n", citation.title));
+                    }
+                    if !citation.authors.is_empty() {
+                        let author_strings: Vec<String> = citation
+                            .authors
+                            .iter()
+                            .map(|a| {
+                                if let Some(given) = &a.given_name {
+                                    format!("{} {}", given, a.name)
+                                } else {
+                                    a.name.clone()
+                                }
+                            })
+                            .collect();
+                        formatted_content.push_str(&format!("Authors: {}\n", author_strings.join(", ")));
+                    }
+                    if let Some(journal) = &citation.journal {
+                        formatted_content.push_str(&format!("Journal: {}\n", journal));
+                    }
+                    if let Some(date) = &citation.date {
+                        formatted_content.push_str(&format!("Year: {}\n", date.year));
+                    }
+                    if let Some(volume) = &citation.volume {
+                        formatted_content.push_str(&format!("Volume: {}", volume));
+                        if let Some(issue) = &citation.issue {
+                            formatted_content.push_str(&format!(", Issue: {}", issue));
+                        }
+                        if let Some(pages) = &citation.pages {
+                            formatted_content.push_str(&format!(", Pages: {}", pages));
+                        }
+                        formatted_content.push('\n');
+                    }
+                    if let Some(doi) = &citation.doi {
+                        formatted_content.push_str(&format!("DOI: {}\n", doi));
+                    }
+                    if let Some(pmid) = &citation.pmid {
+                        formatted_content.push_str(&format!("PMID: {}\n", pmid));
+                    }
+                    if let Some(abstract_text) = &citation.abstract_text {
+                        if !abstract_text.is_empty() {
+                            formatted_content.push_str(&format!("Abstract: {}\n", abstract_text));
+                        }
+                    }
+                    if !citation.keywords.is_empty() {
+                        formatted_content.push_str(&format!("Keywords: {}\n", citation.keywords.join(", ")));
+                    }
+                    formatted_content.push_str("---\n");
+                }
+            }
+            Err(_err) => {
+                #[cfg(feature = "otel")]
+                tracing::warn!("Citation parsing failed, returning raw content: {}", _err);
+                formatted_content = citation_str.to_string();
+            }
+        }
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
+        additional.insert(Cow::Borrowed("citation_count"), serde_json::json!(citations_vec.len()));
+        let mut authors_list: Vec<String> = authors_set.into_iter().collect();
+        authors_list.sort();
+        additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));
+        if !years_set.is_empty() {
+            let min_year = years_set.iter().min().copied().unwrap_or(0);
+            let max_year = years_set.iter().max().copied().unwrap_or(0);
+            let mut years_sorted: Vec<u32> = years_set.into_iter().collect();
+            years_sorted.sort_unstable();
+            additional.insert(
+                Cow::Borrowed("year_range"),
+                serde_json::json!({
+                    "min": min_year,
+                    "max": max_year,
+                    "years": years_sorted
+                }),
+            );
+        }
+        if !dois_vec.is_empty() {
+            additional.insert(Cow::Borrowed("dois"), serde_json::json!(dois_vec));
+        }
+        let mut keywords_list: Vec<String> = keywords_set.into_iter().collect();
+        keywords_list.sort();
+        if !keywords_list.is_empty() {
+            additional.insert(Cow::Borrowed("keywords"), serde_json::json!(keywords_list));
+        }
+        additional.insert(Cow::Borrowed("format"), serde_json::json!(format_string));
+        Ok(ExtractionResult {
+            content: formatted_content,
+            mime_type: mime_type.to_string().into(),
+            metadata: Metadata {
+                additional,
+                ..Default::default()
+            },
+            pages: None,
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            djot_content: None,
+            elements: None,
+        })
+    }
+    fn supported_mime_types(&self) -> &[&str] {
+        &[
+            "application/x-research-info-systems",
+            "application/x-pubmed",
+            "application/x-endnote+xml",
+        ]
+    }
+    fn priority(&self) -> i32 {
+        60
+    }
+}
+#[cfg(all(test, feature = "office"))]
+mod tests {
+    use super::*;
+    #[tokio::test]
+    async fn test_can_extract_citation_mime_types() {
+        let extractor = CitationExtractor::new();
+        let supported = extractor.supported_mime_types();
+        assert!(supported.contains(&"application/x-research-info-systems"));
+        assert!(supported.contains(&"application/x-pubmed"));
+        assert!(supported.contains(&"application/x-endnote+xml"));
+        assert_eq!(supported.len(), 3);
+    }
+    #[tokio::test]
+    async fn test_extract_simple_ris() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Title
+AU  - Smith, John
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract valid RIS entry");
+        assert!(result.content.contains("Sample Title"));
+        assert!(result.content.contains("Smith"));
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(1))
+        );
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("RIS"))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_multiple_ris_entries() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - First Paper
+AU  - Author One
+PY  - 2020
+ER  -
+TY  - JOUR
+TI  - Second Paper
+AU  - Author Two
+PY  - 2021
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract multiple RIS entries");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(2))
+        );
+        if let Some(year_range) = metadata.additional.get("year_range") {
+            assert_eq!(year_range.get("min"), Some(&serde_json::json!(2020)));
+            assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_doi() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Article
+AU  - Smith, John
+DO  - 10.1234/example.doi
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract RIS with DOI");
+        let metadata = &result.metadata;
+        if let Some(dois) = metadata.additional.get("dois") {
+            assert!(!dois.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_empty_citation_file() {
+        let extractor = CitationExtractor::new();
+        let empty_content = b"";
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(empty_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should handle empty citation file");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(0))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_malformed_ris() {
+        let extractor = CitationExtractor::new();
+        let malformed_content = b"This is not valid RIS format\nJust some random text";
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(malformed_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract malformed as raw content");
+        // When RIS parser encounters unparseable content, it may return empty results
+        // Verify we get a result either way
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("citation_count")),
+            Some(&serde_json::json!(0))
+        );
+    }
+    #[tokio::test]
+    async fn test_citation_extractor_plugin_interface() {
+        let extractor = CitationExtractor::new();
+        assert_eq!(extractor.name(), "citation-extractor");
+        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
+        assert_eq!(extractor.priority(), 60);
+        assert!(!extractor.supported_mime_types().is_empty());
+    }
+    #[test]
+    fn test_citation_extractor_default() {
+        let extractor = CitationExtractor;
+        assert_eq!(extractor.name(), "citation-extractor");
+    }
+    #[tokio::test]
+    async fn test_citation_extractor_initialize_shutdown() {
+        let extractor = CitationExtractor::new();
+        assert!(extractor.initialize().is_ok());
+        assert!(extractor.shutdown().is_ok());
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_keywords() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Sample Article
+AU  - Smith, John
+KW  - keyword1
+KW  - keyword2
+KW  - keyword3
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract RIS with keywords");
+        let metadata = &result.metadata;
+        if let Some(keywords) = metadata.additional.get("keywords") {
+            assert!(!keywords.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_ris_with_multiple_authors() {
+        let extractor = CitationExtractor::new();
+        let ris_content = br#"TY  - JOUR
+TI  - Collaborative Work
+AU  - First Author
+AU  - Second Author
+AU  - Third Author
+PY  - 2023
+ER  -"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(ris_content, "application/x-research-info-systems", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract multiple authors");
+        let metadata = &result.metadata;
+        if let Some(authors) = metadata.additional.get("authors") {
+            assert!(!authors.as_array().unwrap().is_empty());
+        }
+    }
+    #[tokio::test]
+    async fn test_extract_pubmed_format() {
+        let extractor = CitationExtractor::new();
+        let pubmed_content = br#"PMID- 12345678
+TI  - Sample PubMed Article
+FAU - Smith, John
+DP  - 2023"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(pubmed_content, "application/x-pubmed", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract PubMed format");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("PubMed"))
+        );
+    }
+    #[tokio::test]
+    async fn test_extract_endnote_xml_format() {
+        let extractor = CitationExtractor::new();
+        let endnote_content = br#"<?xml version="1.0" encoding="UTF-8"?>
+<xml>
+  <records>
+    <record>
+      <titles>
+        <title>Sample EndNote Article</title>
+      </titles>
+      <authors>
+        <author>Smith, John</author>
+      </authors>
+    </record>
+  </records>
+</xml>"#;
+        let config = ExtractionConfig::default();
+        let result = extractor
+            .extract_bytes(endnote_content, "application/x-endnote+xml", &config)
+            .await;
+        assert!(result.is_ok());
+        let result = result.expect("Should extract EndNote XML format");
+        let metadata = &result.metadata;
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("format")),
+            Some(&serde_json::json!("EndNote XML"))
+        );
+    }
+}

data/vendor/kreuzberg/src/extractors/image.rs CHANGED Viewed

@@ -152,6 +152,7 @@ impl DocumentExtractor for ImageExtractor {
                     chunks: None,
                     images: None,
                     djot_content: None,
+                    elements: None,
                 });
             }
         }
@@ -181,10 +182,23 @@ impl DocumentExtractor for ImageExtractor {
             "image/png",
             "image/jpeg",
             "image/jpg",
+            "image/pjpeg",
             "image/webp",
             "image/bmp",
+            "image/x-bmp",
+            "image/x-ms-bmp",
             "image/tiff",
+            "image/x-tiff",
             "image/gif",
+            "image/jp2",
+            "image/jpx",
+            "image/jpm",
+            "image/mj2",
+            "image/x-jbig2",
+            "image/x-portable-anymap",
+            "image/x-portable-bitmap",
+            "image/x-portable-graymap",
+            "image/x-portable-pixmap",
         ]
     }
@@ -223,4 +237,15 @@ mod tests {
         let extractor = ImageExtractor;
         assert_eq!(extractor.name(), "image-extractor");
     }
+    #[test]
+    fn test_image_extractor_supports_alias_mime_types() {
+        let extractor = ImageExtractor::new();
+        let supported = extractor.supported_mime_types();
+        assert!(supported.contains(&"image/pjpeg"));
+        assert!(supported.contains(&"image/x-bmp"));
+        assert!(supported.contains(&"image/x-ms-bmp"));
+        assert!(supported.contains(&"image/x-tiff"));
+        assert!(supported.contains(&"image/x-portable-anymap"));
+    }
 }

data/vendor/kreuzberg/src/extractors/markdown.rs CHANGED Viewed

@@ -238,7 +238,14 @@ impl DocumentExtractor for MarkdownExtractor {
     }
     fn supported_mime_types(&self) -> &[&str] {
-        &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"]
+        &[
+            "text/markdown",
+            "text/x-markdown",
+            "text/x-gfm",
+            "text/x-commonmark",
+            "text/x-markdown-extra",
+            "text/x-multimarkdown",
+        ]
     }
     fn priority(&self) -> i32 {
@@ -261,6 +268,8 @@ mod tests {
         assert!(mime_types.contains(&"text/x-markdown"));
         assert!(mime_types.contains(&"text/x-gfm"));
         assert!(mime_types.contains(&"text/x-commonmark"));
+        assert!(mime_types.contains(&"text/x-markdown-extra"));
+        assert!(mime_types.contains(&"text/x-multimarkdown"));
     }
     #[test]