RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/tests/pandoc_integration.rs ADDED Viewed

@@ -0,0 +1,503 @@
+//! Pandoc integration tests.
+//!
+//! Tests for Pandoc-based document extraction (RST, LaTeX, ODT, RTF, EPUB, Org mode, Typst, CommonMark).
+//! Validates that Pandoc integration works when available and degrades gracefully when missing.
+//!
+//! Note: These tests require the `office` feature to be enabled.
+#![cfg(feature = "office")]
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::extract_bytes;
+use kreuzberg::extraction::pandoc::validate_pandoc_version;
+mod helpers;
+/// Reports whether `validate_pandoc_version` completes successfully.
+///
+/// The tests in this file call it first and return early when it yields `false`.
+async fn is_pandoc_available() -> bool {
+    let version_check = validate_pandoc_version().await;
+    version_check.is_ok()
+}
+/// Test reStructuredText (RST) extraction.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on a small RST
+/// document with the default config and checks the reported MIME type, that no chunks,
+/// detected languages or tables are produced, and that the title, paragraph, section heading,
+/// bullet list and inline-formatted text show up in the extracted content.
+#[tokio::test]
+async fn test_rst_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let rst_content = b"Title
+=====
+This is a paragraph in reStructuredText.
+Section Heading
+---------------
+- Bullet point 1
+- Bullet point 2
+- Bullet point 3
+**Bold text** and *italic text*.";
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(rst_content, "text/x-rst", &config)
+        .await
+        .expect("RST extraction should succeed");
+    assert_eq!(extraction.mime_type, "text/x-rst");
+    assert!(!extraction.content.is_empty(), "Content should not be empty");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(extraction.tables.is_empty(), "RST should not extract tables");
+    assert!(extraction.content.contains("Title"), "Should extract title");
+    assert!(
+        extraction.content.contains("paragraph"),
+        "Should extract paragraph text"
+    );
+    assert!(
+        extraction.content.contains("Section Heading"),
+        "Should extract section heading"
+    );
+    assert!(
+        extraction.content.contains("Bullet point 1") || extraction.content.contains("point 1"),
+        "Should extract bullet points"
+    );
+    assert!(
+        extraction.content.contains("Bold text") || extraction.content.contains("italic text"),
+        "Should extract formatted text content"
+    );
+    // Case-insensitive pass over the same landmarks.
+    let content_lower = extraction.content.to_lowercase();
+    assert!(content_lower.contains("title"), "Should extract title");
+    assert!(content_lower.contains("section"), "Should extract section heading");
+    assert!(content_lower.contains("bullet"), "Should extract bullet list");
+}
+/// Test LaTeX extraction.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on a minimal
+/// LaTeX article and checks the reported MIME type, that no chunks, detected languages or
+/// tables are produced, that the title, headings and paragraph text are present, and that
+/// raw LaTeX commands do not leak into the extracted content.
+#[tokio::test]
+async fn test_latex_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let latex_content = b"\\documentclass{article}
+\\begin{document}
+\\title{Test Document}
+\\author{Test Author}
+\\maketitle
+\\section{Introduction}
+This is a test LaTeX document with \\textbf{bold} and \\textit{italic} text.
+\\subsection{Subsection}
+Some content in a subsection.
+\\end{document}";
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(latex_content, "application/x-latex", &config)
+        .await
+        .expect("LaTeX extraction should succeed");
+    assert_eq!(extraction.mime_type, "application/x-latex");
+    assert!(!extraction.content.is_empty(), "Content should not be empty");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(
+        extraction.tables.is_empty(),
+        "LaTeX should not extract tables in this test"
+    );
+    assert!(
+        extraction.content.contains("Test Document"),
+        "Should extract document title"
+    );
+    assert!(
+        extraction.content.contains("Introduction"),
+        "Should extract section heading"
+    );
+    assert!(
+        extraction.content.contains("Subsection"),
+        "Should extract subsection heading"
+    );
+    assert!(
+        extraction.content.contains("test LaTeX document"),
+        "Should extract paragraph text"
+    );
+    assert!(
+        !extraction.content.contains("\\textbf") && !extraction.content.contains("\\section"),
+        "LaTeX commands should be stripped, not included in output"
+    );
+}
+/// Test OpenDocument Text (ODT) handling of invalid input.
+///
+/// Returns early when Pandoc is not available. Otherwise passes plain text labelled with the
+/// ODT MIME type to `extract_bytes` and requires a `Parsing` or `Io` error in response.
+#[tokio::test]
+async fn test_odt_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let invalid_odt = b"This is not a valid ODT file";
+    match extract_bytes(invalid_odt, "application/vnd.oasis.opendocument.text", &config).await {
+        // Success on garbage input is a failure of the test.
+        Ok(_) => panic!("Invalid ODT should fail gracefully"),
+        Err(kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Io(_)) => {}
+        Err(other) => panic!("Expected Parsing or Io error, got: {:?}", other),
+    }
+}
+/// Test Rich Text Format (RTF) extraction.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on a small RTF
+/// document and checks the reported MIME type, that no chunks, detected languages or tables
+/// are produced, that the paragraph and the bold/italic words are present, and that RTF
+/// control codes do not leak into the extracted content.
+#[tokio::test]
+async fn test_rtf_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let rtf_content = b"{\\rtf1\\ansi\\deff0
+{\\fonttbl{\\f0 Times New Roman;}}
+\\f0\\fs24 This is a test RTF document.\\par
+\\b Bold text\\b0  and \\i italic text\\i0.\\par
+}";
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(rtf_content, "application/rtf", &config)
+        .await
+        .expect("RTF extraction should succeed");
+    assert_eq!(extraction.mime_type, "application/rtf");
+    assert!(!extraction.content.is_empty(), "Content should not be empty");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(
+        extraction.tables.is_empty(),
+        "RTF should not extract tables in this test"
+    );
+    assert!(
+        extraction.content.contains("test RTF document"),
+        "Should extract main paragraph"
+    );
+    // Matching the single word also covers the longer phrase ("Bold text" contains "Bold").
+    assert!(extraction.content.contains("Bold"), "Should extract bold text");
+    assert!(extraction.content.contains("italic"), "Should extract italic text");
+    assert!(
+        !extraction.content.contains("\\rtf") && !extraction.content.contains("\\par"),
+        "RTF control codes should be stripped from output"
+    );
+}
+/// Test graceful degradation when Pandoc is not installed.
+///
+/// Only meaningful where `validate_pandoc_version` fails: there the test checks that the call
+/// hands back an error value instead of panicking. When validation succeeds the test is
+/// skipped.
+#[tokio::test]
+async fn test_pandoc_not_installed() {
+    match validate_pandoc_version().await {
+        Ok(_) => println!("Pandoc is installed - skipping 'not installed' test"),
+        // Receiving an `Err` (rather than a panic) is the behaviour under test. Asserting
+        // `is_err()` in this arm could never fail, so no assertion is made.
+        Err(_) => {}
+    }
+}
+/// Test Pandoc conversion error handling.
+///
+/// Returns early when Pandoc is not available. Otherwise feeds a deliberately malformed RST
+/// snippet to `extract_bytes`. Both success and failure are accepted; the test passes as long
+/// as the call returns instead of panicking.
+#[tokio::test]
+async fn test_pandoc_conversion_error() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let malformed_rst = b"===\nThis is malformed\n===\n===";
+    // Either variant is acceptable, so asserting `is_ok() || is_err()` would be a tautology.
+    // Log which path was taken instead so a test run shows how the input was handled.
+    match extract_bytes(malformed_rst, "text/x-rst", &config).await {
+        Ok(extraction) => println!(
+            "Malformed RST was accepted ({} bytes extracted)",
+            extraction.content.len()
+        ),
+        Err(error) => println!("Malformed RST was rejected: {:?}", error),
+    }
+}
+/// Test EPUB (ebook format) handling of invalid input.
+///
+/// Returns early when Pandoc is not available. Otherwise passes plain text labelled with the
+/// EPUB MIME type to `extract_bytes` and requires a `Parsing` or `Io` error in response.
+#[tokio::test]
+async fn test_epub_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let invalid_epub = b"This is not a valid EPUB file";
+    match extract_bytes(invalid_epub, "application/epub+zip", &config).await {
+        // Success on garbage input is a failure of the test.
+        Ok(_) => panic!("Invalid EPUB should fail gracefully"),
+        Err(kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Io(_)) => {}
+        Err(other) => panic!("Expected Parsing or Io error for invalid EPUB, got: {:?}", other),
+    }
+}
+/// Test Org mode extraction.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on a small Org
+/// document and checks that no chunks, detected languages or tables are produced and that
+/// heading or paragraph text is present in the extracted content.
+#[tokio::test]
+async fn test_org_mode_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let org_content = b"* Top Level Heading
+This is a paragraph in Org mode.
+** Second Level Heading
+- Item 1
+- Item 2
+- Item 3
+*bold text* and /italic text/";
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(org_content, "text/x-org", &config)
+        .await
+        .expect("Org mode extraction should succeed");
+    assert!(!extraction.content.is_empty(), "Content should not be empty");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(
+        extraction.tables.is_empty(),
+        "Org mode should not extract tables in this test"
+    );
+    assert!(
+        extraction.content.contains("Top Level") || extraction.content.contains("paragraph"),
+        "Org mode content should be extracted"
+    );
+    assert!(
+        extraction.content.contains("paragraph") || extraction.content.contains("Heading"),
+        "Text content should be present"
+    );
+}
+/// Test Typst extraction (new document format).
+///
+/// Returns early when Pandoc is not available. Otherwise feeds a small Typst document to
+/// `extract_bytes`. Both success and failure are accepted (Typst may not be supported in all
+/// Pandoc versions); the test passes as long as the call returns instead of panicking.
+#[tokio::test]
+async fn test_typst_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let typst_content = b"= Heading
+This is a paragraph in Typst.
+== Subheading
+#strong[Bold text] and #emph[italic text].";
+    // Either variant is acceptable, so asserting `is_ok() || is_err()` would be a tautology.
+    // Log which path was taken instead so a test run shows whether Typst was supported.
+    match extract_bytes(typst_content, "application/x-typst", &config).await {
+        Ok(extraction) => println!(
+            "Typst input was extracted ({} bytes)",
+            extraction.content.len()
+        ),
+        Err(error) => println!("Typst input was rejected: {:?}", error),
+    }
+}
+/// Test CommonMark extraction.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on a small
+/// CommonMark document and checks that no chunks, detected languages or tables are produced
+/// and that heading/paragraph text and list items are present in the extracted content.
+#[tokio::test]
+async fn test_commonmark_extraction() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let commonmark_content = b"# Heading
+This is a paragraph in CommonMark.
+## Subheading
+- List item 1
+- List item 2
+**Bold** and *italic* text.";
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(commonmark_content, "text/x-commonmark", &config)
+        .await
+        .expect("CommonMark extraction should succeed");
+    assert!(!extraction.content.is_empty(), "Content should not be empty");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(
+        extraction.tables.is_empty(),
+        "CommonMark should not extract tables in this test"
+    );
+    assert!(
+        extraction.content.contains("Heading") || extraction.content.contains("paragraph"),
+        "CommonMark content should be extracted"
+    );
+    // Case-insensitive checks for the remaining landmarks.
+    let content_lower = extraction.content.to_lowercase();
+    assert!(
+        content_lower.contains("heading") || content_lower.contains("paragraph"),
+        "Should extract text"
+    );
+    assert!(
+        content_lower.contains("list") || content_lower.contains("item"),
+        "Should extract list items"
+    );
+}
+/// Test empty content.
+///
+/// Returns early when Pandoc is not available. Otherwise sends zero bytes of RST through
+/// `extract_bytes`; an error is tolerated, and a successful extraction must contain nothing
+/// but whitespace.
+#[tokio::test]
+async fn test_pandoc_empty_content() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let empty_rst = b"";
+    match extract_bytes(empty_rst, "text/x-rst", &config).await {
+        // An empty string also trims to empty, so one check covers both cases.
+        Ok(extraction) => assert!(
+            extraction.content.trim().is_empty(),
+            "Empty input should produce empty or minimal output"
+        ),
+        // Rejecting empty input is acceptable.
+        Err(_) => {}
+    }
+}
+/// Test Unicode content in Pandoc formats.
+///
+/// Returns early when Pandoc is not available. Otherwise runs `extract_bytes` on an RST
+/// document containing CJK, Arabic, accented and emoji characters and checks that extraction
+/// succeeds, that no chunks, detected languages or tables are produced, and that more than
+/// 20 bytes of content come back.
+#[tokio::test]
+async fn test_pandoc_unicode_content() {
+    if !is_pandoc_available().await {
+        println!("Skipping test: Pandoc not installed");
+        return;
+    }
+    let config = ExtractionConfig::default();
+    let unicode_rst = "Title with Unicode
+==================
+This document contains Unicode: 你好世界 🌍 café
+Section
+-------
+Arabic: مرحبا
+Emoji: 🎉 ✅ 🚀"
+        .as_bytes();
+    // `expect` reports the underlying error on failure; asserting `is_ok()` first would hide it.
+    let extraction = extract_bytes(unicode_rst, "text/x-rst", &config)
+        .await
+        .expect("Unicode content should be handled");
+    assert!(!extraction.content.is_empty(), "Content should be extracted");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(
+        extraction.tables.is_empty(),
+        "RST should not extract tables in this test"
+    );
+    // `len()` counts bytes, not characters.
+    assert!(
+        extraction.content.len() > 20,
+        "Should have substantial extracted content"
+    );
+}

data/vendor/kreuzberg/tests/pipeline_integration.rs CHANGED Viewed

@@ -850,6 +850,7 @@ async fn test_pipeline_multiple_processor_errors() {
     let result = run_pipeline(result, &config).await;
     assert!(result.is_err(), "Expected pipeline to return error");
+    // First failing processor (fail1 in Early stage) will cause pipeline to fail
     match result {
         Err(KreuzbergError::Plugin { message, plugin_name }) => {
             assert_eq!(message, "fail1 error");

data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs CHANGED Viewed

@@ -433,6 +433,7 @@ fn test_postprocessor_error_handling() {
     let result = extract_file_sync(test_file, None, &config);
     // NOTE: Plugin errors now bubble up and fail the extraction (design change)
+    // Other error types (non-IO, non-Plugin) are caught and recorded in metadata
     assert!(
         result.is_err(),
         "Extraction should fail when postprocessor returns Plugin error"

data/vendor/kreuzberg/tests/registry_integration_tests.rs CHANGED Viewed

@@ -15,6 +15,8 @@ use kreuzberg::{KreuzbergError, Result};
 use std::path::Path;
 use std::sync::Arc;
+// ===== Mock Validators =====
 struct MockValidator {
     name: String,
     should_fail: bool,
@@ -85,6 +87,8 @@ impl Validator for FailingInitValidator {
     }
 }
+// ===== Mock Extractors =====
 struct MockExtractor {
     name: String,
     mime_types: Vec<&'static str>,
@@ -142,6 +146,8 @@ impl DocumentExtractor for MockExtractor {
     }
 }
+// ===== Validator Registry Tests =====
 /// Test validator registration and listing.
 #[test]
 fn test_validator_registration_succeeds() {
@@ -274,10 +280,13 @@ fn test_validator_registration_with_failed_init_fails() {
     assert!(result.is_err(), "Registration with failed init should fail");
     match result {
-        Err(KreuzbergError::Plugin { .. }) => {}
+        Err(KreuzbergError::Plugin { .. }) => {
+            // Expected error type
+        }
         _ => panic!("Expected Plugin error"),
     }
+    // Validator should not be in the list
     assert_eq!(registry.list().len(), 0, "Failed validator should not be registered");
 }
@@ -286,6 +295,7 @@ fn test_validator_registration_with_failed_init_fails() {
 fn test_clear_validators_succeeds() {
     let mut registry = ValidatorRegistry::new();
+    // Register multiple validators
     let v1 = Arc::new(MockValidator {
         name: "validator-1".to_string(),
         should_fail: false,
@@ -299,6 +309,7 @@ fn test_clear_validators_succeeds() {
     registry.register(v2).unwrap();
     assert_eq!(registry.list().len(), 2);
+    // Clear all
     let result = registry.shutdown_all();
     assert!(result.is_ok(), "Clear should succeed");
     assert_eq!(registry.list().len(), 0, "Registry should be empty after clear");
@@ -359,11 +370,14 @@ fn test_get_all_validators_respects_priority() {
     let all = registry.get_all();
     assert_eq!(all.len(), 3, "Should have three validators");
+    // Should be in descending priority order
     assert_eq!(all[0].name(), "high-priority");
     assert_eq!(all[1].name(), "medium-priority");
     assert_eq!(all[2].name(), "low-priority");
 }
+// ===== Extractor Registry Tests =====
 /// Test extractor registration and retrieval.
 #[test]
 fn test_extractor_registration_succeeds() {
@@ -437,6 +451,7 @@ fn test_extractor_priority_selection() {
     registry.register(low_priority).unwrap();
     registry.register(high_priority).unwrap();
+    // Should get the high priority extractor
     let result = registry.get("text/plain").unwrap();
     assert_eq!(
         result.name(),
@@ -458,14 +473,17 @@ fn test_extractor_wildcard_mime_matching() {
     registry.register(extractor).unwrap();
+    // Should match text/plain
     let result = registry.get("text/plain");
     assert!(result.is_ok(), "Should match text/plain with text/*");
     assert_eq!(result.unwrap().name(), "text-extractor");
+    // Should match text/html
     let result = registry.get("text/html");
     assert!(result.is_ok(), "Should match text/html with text/*");
     assert_eq!(result.unwrap().name(), "text-extractor");
+    // Should not match application/pdf
     let result = registry.get("application/pdf");
     assert!(result.is_err(), "Should not match application/pdf with text/*");
 }
@@ -488,6 +506,7 @@ fn test_extractor_unregistration_succeeds() {
     assert!(result.is_ok(), "Unregistration should succeed");
     assert_eq!(registry.list().len(), 0, "Registry should be empty after removal");
+    // Should no longer find extractor for MIME type
     let lookup_result = registry.get("text/plain");
     assert!(lookup_result.is_err(), "Should not find extractor after removal");
 }
@@ -505,10 +524,12 @@ fn test_extractor_multiple_mime_types() {
     registry.register(extractor).unwrap();
+    // Should find for all MIME types
     assert!(registry.get("application/pdf").is_ok());
     assert!(registry.get("application/vnd.ms-excel").is_ok());
     assert!(registry.get("text/csv").is_ok());
+    // All should return the same extractor
     assert_eq!(
         registry.get("application/pdf").unwrap().name(),
         "multi-format-extractor"

data/vendor/kreuzberg/tests/security_validation.rs CHANGED Viewed

@@ -12,17 +12,6 @@ use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync};
 use std::io::Write;
 use tempfile::NamedTempFile;
-fn trim_trailing_newlines(value: &str) -> &str {
-    value.trim_end_matches(['\n', '\r'])
-}
-fn assert_text_content(actual: &str, expected: &str) {
-    assert_eq!(
-        trim_trailing_newlines(actual),
-        expected,
-        "Content mismatch after trimming trailing newlines"
-    );
-}
 #[test]
 fn test_archive_zip_bomb_detection() {
     let mut cursor = std::io::Cursor::new(Vec::new());
@@ -277,7 +266,7 @@ fn test_resource_single_byte_file() {
     assert!(result.is_ok());
     if let Ok(extracted) = result {
-        assert_text_content(&extracted.content, "a");
+        assert_eq!(extracted.content, "a");
     }
 }