RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

checksums.yaml +4 -4
data/Gemfile.lock +5 -5
data/README.md +15 -9
data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
data/kreuzberg.gemspec +38 -4
data/lib/kreuzberg/config.rb +34 -1
data/lib/kreuzberg/result.rb +77 -14
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +23 -6
data/vendor/kreuzberg/Cargo.toml +32 -11
data/vendor/kreuzberg/README.md +54 -8
data/vendor/kreuzberg/build.rs +549 -132
data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
data/vendor/kreuzberg/src/core/config.rs +49 -1
data/vendor/kreuzberg/src/core/extractor.rs +134 -2
data/vendor/kreuzberg/src/core/mod.rs +4 -2
data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
data/vendor/kreuzberg/src/extraction/html.rs +24 -8
data/vendor/kreuzberg/src/extraction/image.rs +124 -1
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
data/vendor/kreuzberg/src/extractors/email.rs +29 -15
data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
data/vendor/kreuzberg/src/extractors/html.rs +29 -15
data/vendor/kreuzberg/src/extractors/image.rs +25 -4
data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
data/vendor/kreuzberg/src/extractors/text.rs +7 -2
data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
data/vendor/kreuzberg/src/lib.rs +10 -2
data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
data/vendor/kreuzberg/src/mcp/server.rs +120 -12
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
data/vendor/kreuzberg/src/pdf/error.rs +8 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
data/vendor/kreuzberg/src/pdf/table.rs +26 -2
data/vendor/kreuzberg/src/pdf/text.rs +89 -7
data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
data/vendor/kreuzberg/src/text/mod.rs +6 -0
data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
data/vendor/kreuzberg/src/types.rs +173 -21
data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
data/vendor/kreuzberg/tests/config_features.rs +15 -1
data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/email_integration.rs +2 -0
data/vendor/kreuzberg/tests/error_handling.rs +43 -34
data/vendor/kreuzberg/tests/format_integration.rs +2 -0
data/vendor/kreuzberg/tests/image_integration.rs +2 -0
data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -0
data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
data/vendor/rb-sys/Cargo.lock +15 -15
data/vendor/rb-sys/Cargo.toml +4 -4
data/vendor/rb-sys/Cargo.toml.orig +4 -4
data/vendor/rb-sys/build/features.rs +5 -2
data/vendor/rb-sys/build/main.rs +55 -15
data/vendor/rb-sys/build/stable_api_config.rs +4 -2
data/vendor/rb-sys/build/version.rs +3 -1
data/vendor/rb-sys/src/lib.rs +1 -0
data/vendor/rb-sys/src/macros.rs +2 -2
data/vendor/rb-sys/src/special_consts.rs +1 -1
data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
data/vendor/rb-sys/src/stable_api.rs +0 -1
data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
metadata +13 -10
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316

data/vendor/kreuzberg/tests/docbook_extractor_tests.rs CHANGED Viewed

@@ -1,5 +1,7 @@
 //! Comprehensive tests for DocBook extractor supporting both 4.x and 5.x versions.
+#![cfg(feature = "xml")]
 use kreuzberg::core::config::ExtractionConfig;
 use kreuzberg::plugins::{DocumentExtractor, Plugin};
 use std::path::PathBuf;

data/vendor/kreuzberg/tests/email_integration.rs CHANGED Viewed

@@ -3,6 +3,8 @@
 //! Tests for .eml (RFC822) email extraction.
 //! Validates metadata extraction, content extraction, HTML/plain text handling, and attachments.
+#![cfg(feature = "email")]
 use kreuzberg::core::config::ExtractionConfig;
 use kreuzberg::core::extractor::extract_bytes;

data/vendor/kreuzberg/tests/error_handling.rs CHANGED Viewed

@@ -12,6 +12,7 @@ mod helpers;
 /// Test truncated PDF - incomplete PDF file.
 #[tokio::test]
+#[cfg(feature = "pdf")]
 async fn test_truncated_pdf() {
     let config = ExtractionConfig::default();
@@ -31,6 +32,7 @@ async fn test_truncated_pdf() {
 /// Test corrupted ZIP - malformed archive.
 #[tokio::test]
+#[cfg(feature = "archives")]
 async fn test_corrupted_zip() {
     let config = ExtractionConfig::default();
@@ -50,6 +52,7 @@ async fn test_corrupted_zip() {
 /// Test invalid XML - bad XML syntax.
 #[tokio::test]
+#[cfg(feature = "xml")]
 async fn test_invalid_xml() {
     let config = ExtractionConfig::default();
@@ -80,6 +83,7 @@ async fn test_invalid_xml() {
 /// Test corrupted image - invalid image data.
 #[tokio::test]
+#[cfg(feature = "ocr")]
 async fn test_corrupted_image() {
     let config = ExtractionConfig::default();
@@ -112,27 +116,28 @@ async fn test_empty_file() {
     let empty_data = b"";
-    let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
     let result_text = extract_bytes(empty_data, "text/plain", &config).await;
-    let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
-    match result_pdf {
-        Ok(extraction) => {
-            assert!(
-                extraction.content.is_empty(),
-                "Empty PDF should have empty content if it succeeds"
-            );
-            assert!(extraction.chunks.is_none(), "Chunks should be None");
-        }
-        Err(error) => {
-            assert!(
-                matches!(
-                    error,
-                    kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
-                ),
-                "Empty PDF should produce Parsing or Validation error, got: {:?}",
-                error
-            );
+    #[cfg(feature = "pdf")]
+    {
+        let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
+        match result_pdf {
+            Ok(extraction) => {
+                assert!(
+                    extraction.content.is_empty(),
+                    "Empty PDF should have empty content if it succeeds"
+                );
+                assert!(extraction.chunks.is_none(), "Chunks should be None");
+            }
+            Err(error) => {
+                assert!(
+                    matches!(
+                        error,
+                        kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
+                    ),
+                    "Empty PDF should produce Parsing or Validation error, got: {:?}",
+                    error
+                );
+            }
         }
     }
@@ -149,20 +154,24 @@ async fn test_empty_file() {
         }
     }
-    match result_xml {
-        Ok(extraction) => {
-            assert!(
-                extraction.content.is_empty(),
-                "Empty XML should have empty content if it succeeds"
-            );
-            assert!(extraction.chunks.is_none(), "Chunks should be None");
-        }
-        Err(error) => {
-            assert!(
-                matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
-                "Empty XML error should be Parsing type, got: {:?}",
-                error
-            );
+    #[cfg(feature = "xml")]
+    {
+        let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
+        match result_xml {
+            Ok(extraction) => {
+                assert!(
+                    extraction.content.is_empty(),
+                    "Empty XML should have empty content if it succeeds"
+                );
+                assert!(extraction.chunks.is_none(), "Chunks should be None");
+            }
+            Err(error) => {
+                assert!(
+                    matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
+                    "Empty XML error should be Parsing type, got: {:?}",
+                    error
+                );
+            }
         }
     }
 }

data/vendor/kreuzberg/tests/format_integration.rs CHANGED Viewed

@@ -5,6 +5,8 @@
 //! asynchronous APIs or to graceful handling when optional system
 //! dependencies are missing.
+#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]
 mod helpers;
 use helpers::{assert_mime_type, assert_non_empty_content, get_test_file_path, test_documents_available};

data/vendor/kreuzberg/tests/image_integration.rs CHANGED Viewed

@@ -11,6 +11,8 @@
 //! - Test OCR with various languages and layouts
 //! - Verify graceful handling of images without text
+#![cfg(feature = "ocr")]
 mod helpers;
 use helpers::*;

data/vendor/kreuzberg/tests/mime_detection.rs CHANGED Viewed

@@ -276,22 +276,23 @@ async fn test_no_extension() {
     let detected = detect_mime_type(&temp_path, true);
-    if detected.is_err() {
-        let error = detected.unwrap_err();
-        assert!(
-            matches!(
-                error,
-                kreuzberg::KreuzbergError::Validation { .. } | kreuzberg::KreuzbergError::UnsupportedFormat(_)
-            ),
-            "Should return appropriate error for file without extension"
-        );
-    } else {
-        let mime = detected.unwrap();
-        assert!(
-            mime.contains('/'),
-            "Detected MIME type should be valid format: {}",
-            mime
-        );
+    match detected {
+        Err(error) => {
+            assert!(
+                matches!(
+                    error,
+                    kreuzberg::KreuzbergError::Validation { .. } | kreuzberg::KreuzbergError::UnsupportedFormat(_)
+                ),
+                "Should return appropriate error for file without extension"
+            );
+        }
+        Ok(mime) => {
+            assert!(
+                mime.contains('/'),
+                "Detected MIME type should be valid format: {}",
+                mime
+            );
+        }
     }
     let _ = std::fs::remove_file(&temp_path);

data/vendor/kreuzberg/tests/ocr_configuration.rs CHANGED Viewed

@@ -11,6 +11,8 @@
 //! - Verify configuration changes actually affect output
 //! - Test table detection with various settings
+#![cfg(feature = "ocr")]
 mod helpers;
 use helpers::*;
@@ -204,6 +206,7 @@ fn test_ocr_psm_single_line() {
 }
 #[test]
+#[cfg(feature = "pdf")]
 fn test_force_ocr_on_text_pdf() {
     if skip_if_missing("pdfs/fake_memo.pdf") {
         return;
@@ -233,6 +236,7 @@ fn test_force_ocr_on_text_pdf() {
 }
 #[test]
+#[cfg(feature = "pdf")]
 fn test_force_ocr_disabled() {
     if skip_if_missing("pdfs/fake_memo.pdf") {
         return;

data/vendor/kreuzberg/tests/ocr_errors.rs CHANGED Viewed

@@ -13,6 +13,8 @@
 //! - Test recovery from transient failures
 //! - Validate resource limits and constraints
+#![cfg(feature = "ocr")]
 mod helpers;
 use helpers::*;
@@ -451,6 +453,9 @@ fn test_ocr_cache_disabled_then_enabled() {
     };
     let result1 = extract_file_sync(&file_path, None, &config_no_cache);
+    if matches!(result1, Err(KreuzbergError::MissingDependency(_))) {
+        return;
+    }
     assert!(result1.is_ok(), "First extraction should succeed");
     let config_with_cache = ExtractionConfig {
@@ -468,6 +473,9 @@ fn test_ocr_cache_disabled_then_enabled() {
     };
     let result2 = extract_file_sync(&file_path, None, &config_with_cache);
+    if matches!(result2, Err(KreuzbergError::MissingDependency(_))) {
+        return;
+    }
     assert!(result2.is_ok(), "Second extraction should succeed");
     assert_non_empty_content(&result1.unwrap());
@@ -495,6 +503,13 @@ fn test_ocr_concurrent_same_file() {
         ..Default::default()
     });
+    if matches!(
+        extract_file_sync(&*file_path, None, &config),
+        Err(KreuzbergError::MissingDependency(_))
+    ) {
+        return;
+    }
     let mut handles = vec![];
     for i in 0..5 {
         let file_path_clone = Arc::clone(&file_path);
@@ -554,6 +569,13 @@ fn test_ocr_concurrent_different_files() {
         ..Default::default()
     });
+    if matches!(
+        extract_file_sync(&files[0], None, &config),
+        Err(KreuzbergError::MissingDependency(_))
+    ) {
+        return;
+    }
     let mut handles = vec![];
     for (i, file_path) in files.iter().enumerate() {
         let file_path_clone = file_path.clone();

data/vendor/kreuzberg/tests/ocr_quality.rs CHANGED Viewed

@@ -14,6 +14,8 @@
 //! - Verify layout preservation (line counts, structure)
 //! - Assert minimum quality thresholds
+#![cfg(all(feature = "ocr", feature = "pdf"))]
 mod helpers;
 use helpers::*;

data/vendor/kreuzberg/tests/odt_extractor_tests.rs CHANGED Viewed

@@ -69,16 +69,13 @@ async fn test_odt_metadata_extraction() {
         "Should contain document title in content"
     );
-    // Verify metadata extraction
     let metadata = &result.metadata.additional;
     println!("Extracted metadata: {:?}", metadata);
-    // Check title
     if let Some(title) = metadata.get("title") {
         assert_eq!(title.as_str(), Some("Test Metadata Document"), "Title should match");
     }
-    // Check subject
     if let Some(subject) = metadata.get("subject") {
         assert_eq!(
             subject.as_str(),
@@ -87,28 +84,23 @@ async fn test_odt_metadata_extraction() {
         );
     }
-    // Check creator/author
     if let Some(created_by) = metadata.get("created_by") {
         assert_eq!(created_by.as_str(), Some("John Doe"), "Creator should match");
     }
-    // Check authors array
     if let Some(authors) = metadata.get("authors") {
         let authors_array = authors.as_array().expect("Authors should be an array");
         assert_eq!(authors_array.len(), 1, "Should have one author");
         assert_eq!(authors_array[0].as_str(), Some("John Doe"), "Author name should match");
     }
-    // Check creation date (should exist)
     assert!(metadata.get("created_at").is_some(), "Creation date should be present");
-    // Check modification date (should exist)
     assert!(
         metadata.get("modified_at").is_some(),
         "Modification date should be present"
     );
-    // Check generator
     if let Some(generator) = metadata.get("generator") {
         let gen_str = generator.as_str().expect("Generator should be a string");
         assert!(gen_str.contains("Pandoc"), "Generator should be Pandoc");
@@ -604,18 +596,11 @@ async fn test_odt_table_no_duplicate_content() {
     assert!(!result.content.is_empty(), "Content should not be empty");
-    // Count how many times we see "Content" in the output
-    // In a properly fixed version, it should appear only once in the markdown table
-    // or possibly twice if headers appear with the same name, but not multiple times
-    // for the same cell
     let content_count = result.content.matches("Content").count();
-    // "Content" appears twice in the header "More content" in a simple table
-    // It should not appear more than 3 times (once in header, once in data cell, once in a different word like "More content")
     println!("   'Content' appears {} times in output", content_count);
     println!("   Content preview:\n{}", result.content);
-    // This verifies that we're not getting duplicate cell content extracted
     assert!(
         content_count <= 3,
         "Content should not appear excessively, indicating no duplicate table cell extraction"
@@ -628,7 +613,6 @@ async fn test_odt_table_no_duplicate_content() {
 /// Uses the extraction_test document created with pandoc to ensure complete content
 #[tokio::test]
 async fn test_odt_comprehensive_table_extraction() {
-    // This test uses the pandoc-generated test document
     let test_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
         .parent()
         .unwrap()
@@ -648,7 +632,6 @@ async fn test_odt_comprehensive_table_extraction() {
     assert!(!result.content.is_empty(), "Content should not be empty");
-    // Verify all sections are present
     assert!(result.content.contains("Comprehensive"), "Should contain heading");
     assert!(
         result.content.contains("First Section") || result.content.contains("First"),
@@ -663,14 +646,12 @@ async fn test_odt_comprehensive_table_extraction() {
         "Should contain third section"
     );
-    // Verify tables are present and formatted correctly (as markdown)
     assert!(
         result.content.contains("|"),
         "Should contain pipe characters for markdown tables"
     );
     assert!(result.content.contains("---"), "Should contain table separator");
-    // Verify table content is extracted
     assert!(
         result.content.contains("Header 1") || result.content.contains("Cell 1A"),
         "Should contain table data"
@@ -680,8 +661,6 @@ async fn test_odt_comprehensive_table_extraction() {
         "Should contain second table data"
     );
-    // Verify no excessive duplication of cells (a simple heuristic check)
-    // Count "Cell 1A" - should appear once or twice at most
     let cell_count = result.content.matches("Cell 1A").count();
     assert!(
         cell_count <= 2,

data/vendor/kreuzberg/tests/pdf_integration.rs CHANGED Viewed

@@ -4,6 +4,8 @@
 //! multi-language E2E generator. This module keeps only the cases that
 //! exercise Rust-specific failure handling or error propagation.
+#![cfg(feature = "pdf")]
 mod helpers;
 use helpers::*;