RubyGems - kreuzberg - Versions diffs - 4.2.12 → 4.2.13 - Mend

kreuzberg 4.2.12 → 4.2.13

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +2 -2
data/vendor/kreuzberg/Cargo.toml +24 -7
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/config/extraction/core.rs +11 -0
data/vendor/kreuzberg/src/core/extractor/bytes.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/file.rs +11 -11
data/vendor/kreuzberg/src/core/mime.rs +47 -2
data/vendor/kreuzberg/src/extraction/archive/gzip.rs +129 -0
data/vendor/kreuzberg/src/extraction/archive/mod.rs +147 -31
data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +44 -4
data/vendor/kreuzberg/src/extraction/archive/tar.rs +38 -3
data/vendor/kreuzberg/src/extraction/archive/zip.rs +37 -3
data/vendor/kreuzberg/src/extraction/image.rs +405 -18
data/vendor/kreuzberg/src/extraction/mod.rs +2 -2
data/vendor/kreuzberg/src/extractors/archive.rs +146 -15
data/vendor/kreuzberg/src/extractors/bibtex.rs +3 -2
data/vendor/kreuzberg/src/extractors/citation.rs +563 -0
data/vendor/kreuzberg/src/extractors/image.rs +25 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +10 -1
data/vendor/kreuzberg/src/extractors/mod.rs +21 -5
data/vendor/kreuzberg/src/extractors/opml/core.rs +2 -1
data/vendor/kreuzberg/src/extractors/security.rs +2 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -3
data/vendor/kreuzberg/src/extractors/text.rs +33 -4
data/vendor/kreuzberg/src/extractors/xml.rs +12 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +16 -3
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +4 -2

data/vendor/kreuzberg/src/extractors/mod.rs CHANGED Viewed

@@ -88,6 +88,9 @@ pub mod html;
 #[cfg(feature = "office")]
 pub mod bibtex;
+#[cfg(feature = "office")]
+pub mod citation;
 #[cfg(all(feature = "tokio-runtime", feature = "office"))]
 pub mod docx;
@@ -146,7 +149,7 @@ pub use text::{MarkdownExtractor, PlainTextExtractor};
 pub use image::ImageExtractor;
 #[cfg(feature = "archives")]
-pub use archive::{SevenZExtractor, TarExtractor, ZipExtractor};
+pub use archive::{GzipExtractor, SevenZExtractor, TarExtractor, ZipExtractor};
 #[cfg(feature = "email")]
 pub use email::EmailExtractor;
@@ -160,6 +163,9 @@ pub use html::HtmlExtractor;
 #[cfg(feature = "office")]
 pub use bibtex::BibtexExtractor;
+#[cfg(feature = "office")]
+pub use citation::CitationExtractor;
 #[cfg(all(feature = "tokio-runtime", feature = "office"))]
 pub use docx::DocxExtractor;
@@ -278,7 +284,11 @@ pub fn register_default_extractors() -> Result<()> {
     registry.register(Arc::new(ImageExtractor::new()))?;
     #[cfg(feature = "xml")]
-    registry.register(Arc::new(XmlExtractor::new()))?;
+    {
+        registry.register(Arc::new(XmlExtractor::new()))?;
+        registry.register(Arc::new(JatsExtractor::new()))?;
+        registry.register(Arc::new(DocbookExtractor::new()))?;
+    }
     #[cfg(feature = "pdf")]
     registry.register(Arc::new(PdfExtractor::new()))?;
@@ -292,6 +302,7 @@ pub fn register_default_extractors() -> Result<()> {
     {
         registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
         registry.register(Arc::new(BibtexExtractor::new()))?;
+        registry.register(Arc::new(CitationExtractor::new()))?;
         registry.register(Arc::new(EpubExtractor::new()))?;
         registry.register(Arc::new(FictionBookExtractor::new()))?;
         registry.register(Arc::new(RtfExtractor::new()))?;
@@ -321,6 +332,7 @@ pub fn register_default_extractors() -> Result<()> {
         registry.register(Arc::new(ZipExtractor::new()))?;
         registry.register(Arc::new(TarExtractor::new()))?;
         registry.register(Arc::new(SevenZExtractor::new()))?;
+        registry.register(Arc::new(GzipExtractor::new()))?;
     }
     Ok(())
@@ -362,8 +374,10 @@ mod tests {
         #[cfg(feature = "xml")]
         {
-            expected_count += 1;
+            expected_count += 3;
             assert!(extractor_names.contains(&"xml-extractor".to_string()));
+            assert!(extractor_names.contains(&"jats-extractor".to_string()));
+            assert!(extractor_names.contains(&"docbook-extractor".to_string()));
         }
         #[cfg(feature = "pdf")]
@@ -380,9 +394,10 @@ mod tests {
         #[cfg(feature = "office")]
         {
-            expected_count += 10;
+            expected_count += 11;
             assert!(extractor_names.contains(&"markdown-extractor".to_string()));
             assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
+            assert!(extractor_names.contains(&"citation-extractor".to_string()));
             assert!(extractor_names.contains(&"epub-extractor".to_string()));
             assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
             assert!(extractor_names.contains(&"rtf-extractor".to_string()));
@@ -416,10 +431,11 @@ mod tests {
         #[cfg(feature = "archives")]
         {
-            expected_count += 3;
+            expected_count += 4;
             assert!(extractor_names.contains(&"zip-extractor".to_string()));
             assert!(extractor_names.contains(&"tar-extractor".to_string()));
             assert!(extractor_names.contains(&"7z-extractor".to_string()));
+            assert!(extractor_names.contains(&"gzip-extractor".to_string()));
         }
         assert_eq!(

data/vendor/kreuzberg/src/extractors/opml/core.rs CHANGED Viewed

@@ -95,7 +95,7 @@ impl DocumentExtractor for OpmlExtractor {
     }
     fn supported_mime_types(&self) -> &[&str] {
-        &["text/x-opml", "application/xml+opml"]
+        &["text/x-opml", "application/xml+opml", "application/x-opml+xml"]
     }
     fn priority(&self) -> i32 {
@@ -135,6 +135,7 @@ mod tests {
         let supported = extractor.supported_mime_types();
         assert!(supported.contains(&"text/x-opml"));
         assert!(supported.contains(&"application/xml+opml"));
+        assert!(supported.contains(&"application/x-opml+xml"));
     }
     #[tokio::test]

data/vendor/kreuzberg/src/extractors/security.rs CHANGED Viewed

@@ -14,7 +14,8 @@ use std::io::{Read, Seek};
 ///
 /// All limits are intentionally conservative to prevent DoS attacks
 /// while still supporting legitimate documents.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
+#[serde(default)]
 pub struct SecurityLimits {
     /// Maximum uncompressed size for archives (500 MB)
     pub max_archive_size: usize,

data/vendor/kreuzberg/src/extractors/structured.rs CHANGED Viewed

@@ -59,8 +59,12 @@ impl DocumentExtractor for StructuredExtractor {
         _config: &ExtractionConfig,
     ) -> Result<ExtractionResult> {
         let structured_result = match mime_type {
-            "application/json" | "text/json" => crate::extraction::structured::parse_json(content, None)?,
-            "application/x-yaml" | "text/yaml" | "text/x-yaml" => crate::extraction::structured::parse_yaml(content)?,
+            "application/json" | "text/json" | "application/csl+json" => {
+                crate::extraction::structured::parse_json(content, None)?
+            }
+            "application/yaml" | "application/x-yaml" | "text/yaml" | "text/x-yaml" => {
+                crate::extraction::structured::parse_yaml(content)?
+            }
             "application/toml" | "text/toml" => crate::extraction::structured::parse_toml(content)?,
             _ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
         };
@@ -112,6 +116,8 @@ impl DocumentExtractor for StructuredExtractor {
         &[
             "application/json",
             "text/json",
+            "application/csl+json",
+            "application/yaml",
             "application/x-yaml",
             "text/yaml",
             "text/x-yaml",
@@ -141,9 +147,10 @@ mod tests {
     fn test_structured_extractor_supported_mime_types() {
         let extractor = StructuredExtractor::new();
         let mime_types = extractor.supported_mime_types();
-        assert_eq!(mime_types.len(), 7);
+        assert_eq!(mime_types.len(), 9);
         assert!(mime_types.contains(&"application/json"));
         assert!(mime_types.contains(&"application/x-yaml"));
         assert!(mime_types.contains(&"application/toml"));
+        assert!(mime_types.contains(&"application/csl+json"));
     }
 }

data/vendor/kreuzberg/src/extractors/text.rs CHANGED Viewed

@@ -97,7 +97,15 @@ impl DocumentExtractor for PlainTextExtractor {
     }
     fn supported_mime_types(&self) -> &[&str] {
-        &["text/plain", "text/csv", "text/tab-separated-values"]
+        &[
+            "text/plain",
+            "text/csv",
+            "text/tab-separated-values",
+            "text/troff",
+            "text/x-mdoc",
+            "text/x-pod",
+            "text/x-dokuwiki",
+        ]
     }
     fn priority(&self) -> i32 {
@@ -192,7 +200,12 @@ impl DocumentExtractor for MarkdownExtractor {
     }
     fn supported_mime_types(&self) -> &[&str] {
-        &["text/markdown", "text/x-markdown"]
+        &[
+            "text/markdown",
+            "text/x-markdown",
+            "text/x-markdown-extra",
+            "text/x-multimarkdown",
+        ]
     }
     fn priority(&self) -> i32 {
@@ -253,7 +266,15 @@ mod tests {
         assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
         assert_eq!(
             extractor.supported_mime_types(),
-            &["text/plain", "text/csv", "text/tab-separated-values"]
+            &[
+                "text/plain",
+                "text/csv",
+                "text/tab-separated-values",
+                "text/troff",
+                "text/x-mdoc",
+                "text/x-pod",
+                "text/x-dokuwiki",
+            ]
         );
         assert_eq!(extractor.priority(), 50);
     }
@@ -263,7 +284,15 @@ mod tests {
         let extractor = MarkdownExtractor::new();
         assert_eq!(extractor.name(), "markdown-extractor");
         assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
-        assert_eq!(extractor.supported_mime_types(), &["text/markdown", "text/x-markdown"]);
+        assert_eq!(
+            extractor.supported_mime_types(),
+            &[
+                "text/markdown",
+                "text/x-markdown",
+                "text/x-markdown-extra",
+                "text/x-multimarkdown"
+            ]
+        );
         assert_eq!(extractor.priority(), 50);
     }
 }

data/vendor/kreuzberg/src/extractors/xml.rs CHANGED Viewed

@@ -96,7 +96,12 @@ impl DocumentExtractor for XmlExtractor {
     }
     fn supported_mime_types(&self) -> &[&str] {
-        &["application/xml", "text/xml", "image/svg+xml"]
+        &[
+            "application/xml",
+            "text/xml",
+            "image/svg+xml",
+            "application/x-endnote+xml",
+        ]
     }
     fn priority(&self) -> i32 {
@@ -142,7 +147,12 @@ mod tests {
         assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
         assert_eq!(
             extractor.supported_mime_types(),
-            &["application/xml", "text/xml", "image/svg+xml"]
+            &[
+                "application/xml",
+                "text/xml",
+                "image/svg+xml",
+                "application/x-endnote+xml"
+            ]
         );
         assert_eq!(extractor.priority(), 50);
     }

data/vendor/kreuzberg/src/ocr/processor/execution.rs CHANGED Viewed

@@ -72,8 +72,21 @@ pub(super) fn perform_ocr(
         )
     });
-    let img = image::load_from_memory(image_bytes)
-        .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode image: {}", e)))?;
+    let img = {
+        // Check for JPEG 2000 format which the image crate doesn't support
+        if crate::extraction::image::is_jp2(image_bytes) || crate::extraction::image::is_j2k(image_bytes) {
+            crate::extraction::image::decode_jp2_to_rgb(image_bytes)
+                .map(image::DynamicImage::ImageRgb8)
+                .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JP2 image: {}", e)))?
+        } else if crate::extraction::image::is_jbig2(image_bytes) {
+            crate::extraction::image::decode_jbig2_to_gray(image_bytes)
+                .map(image::DynamicImage::ImageLuma8)
+                .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode JBIG2 image: {}", e)))?
+        } else {
+            image::load_from_memory(image_bytes)
+                .map_err(|e| OcrError::ImageProcessingFailed(format!("Failed to decode image: {}", e)))?
+        }
+    };
     let rgb_image = img.to_rgb8();
     let (width, height) = rgb_image.dimensions();
@@ -224,7 +237,7 @@ pub(super) fn perform_ocr(
         "tsv" => {
             let tsv = tsv_data_for_tables
                 .as_ref()
-                .expect("TSV data should be extracted when output_format is 'tsv'")
+                .ok_or_else(|| OcrError::ProcessingFailed("TSV data not available".to_string()))?
                 .clone();
             (tsv, "text/plain".to_string())
         }

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.12"
+version = "4.2.13"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.12
+  version: 4.2.13
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-02-06 00:00:00.000000000 Z
+date: 2026-02-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -422,6 +422,7 @@ files:
 - vendor/kreuzberg/src/core/server_config/validation.rs
 - vendor/kreuzberg/src/embeddings.rs
 - vendor/kreuzberg/src/error.rs
+- vendor/kreuzberg/src/extraction/archive/gzip.rs
 - vendor/kreuzberg/src/extraction/archive/mod.rs
 - vendor/kreuzberg/src/extraction/archive/sevenz.rs
 - vendor/kreuzberg/src/extraction/archive/tar.rs
@@ -463,6 +464,7 @@ files:
 - vendor/kreuzberg/src/extraction/xml.rs
 - vendor/kreuzberg/src/extractors/archive.rs
 - vendor/kreuzberg/src/extractors/bibtex.rs
+- vendor/kreuzberg/src/extractors/citation.rs
 - vendor/kreuzberg/src/extractors/djot_format/attributes.rs
 - vendor/kreuzberg/src/extractors/djot_format/conversion.rs
 - vendor/kreuzberg/src/extractors/djot_format/extractor.rs