RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175)

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/tests/batch_processing.rs CHANGED Viewed

@@ -12,18 +12,6 @@ use std::path::PathBuf;
 mod helpers;
 use helpers::{get_test_documents_dir, get_test_file_path, skip_if_missing, test_documents_available};
-fn trim_trailing_newlines(value: &str) -> &str {
-    value.trim_end_matches(['\n', '\r'])
-}
-fn assert_text_content(actual: &str, expected: &str) {
-    assert_eq!(
-        trim_trailing_newlines(actual),
-        expected,
-        "Content mismatch after trimming trailing newlines"
-    );
-}
 /// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
 #[tokio::test]
 async fn test_batch_extract_file_multiple_formats() {
@@ -134,7 +122,7 @@ async fn test_batch_extract_bytes_multiple() {
     assert_eq!(results.len(), 3);
-    assert_text_content(&results[0].content, "This is plain text content");
+    assert_eq!(results[0].content, "This is plain text content");
     assert_eq!(results[0].mime_type, "text/plain");
     assert!(results[1].content.contains("Markdown Header"));
@@ -310,7 +298,7 @@ fn test_batch_extract_bytes_sync_variant() {
     let results = results.unwrap();
     assert_eq!(results.len(), 3);
-    assert_text_content(&results[0].content, "content 1");
-    assert_text_content(&results[1].content, "content 2");
+    assert_eq!(results[0].content, "content 1");
+    assert_eq!(results[1].content, "content 2");
     assert!(results[2].content.contains("content 3"));
 }

data/vendor/kreuzberg/tests/chunking_offset_demo.rs ADDED Viewed

@@ -0,0 +1,92 @@
+#[cfg(feature = "chunking")]
+#[test]
+fn demonstrate_correct_offset_calculation() {
+    use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text};
+    println!("\n=== Demonstrating Correct Chunking Offset Calculation ===\n");
+    let config_with_overlap = ChunkingConfig {
+        max_characters: 20,
+        overlap: 5,
+        trim: false,
+        chunker_type: ChunkerType::Text,
+    };
+    let text = "AAAAA BBBBB CCCCC DDDDD EEEEE FFFFF";
+    println!("Text: \"{}\"", text);
+    println!(
+        "Max characters: {}, Overlap: {}\n",
+        config_with_overlap.max_characters, config_with_overlap.overlap
+    );
+    let result = chunk_text(text, &config_with_overlap).unwrap();
+    println!("WITH OVERLAP (5 chars):");
+    for (i, chunk) in result.chunks.iter().enumerate() {
+        println!(
+            "  Chunk {}: [{:3} - {:3}] = \"{}\"",
+            i,
+            chunk.metadata.char_start,
+            chunk.metadata.char_end,
+            chunk.content.replace('\n', "\\n")
+        );
+    }
+    println!("\nOverlap verification:");
+    for i in 0..result.chunks.len() - 1 {
+        let current = &result.chunks[i];
+        let next = &result.chunks[i + 1];
+        let overlap_size = current.metadata.char_end - next.metadata.char_start;
+        println!(
+            "  Chunks {} and {}: overlap = {} chars (next starts at {} while current ends at {})",
+            i,
+            i + 1,
+            overlap_size,
+            next.metadata.char_start,
+            current.metadata.char_end
+        );
+        assert!(
+            overlap_size > 0 && overlap_size <= config_with_overlap.overlap + 10,
+            "Overlap should exist and be reasonable"
+        );
+    }
+    println!("\n\n=== Without Overlap ===\n");
+    let config_no_overlap = ChunkingConfig {
+        max_characters: 20,
+        overlap: 0,
+        trim: false,
+        chunker_type: ChunkerType::Text,
+    };
+    let result_no_overlap = chunk_text(text, &config_no_overlap).unwrap();
+    println!("WITHOUT OVERLAP:");
+    for (i, chunk) in result_no_overlap.chunks.iter().enumerate() {
+        println!(
+            "  Chunk {}: [{:3} - {:3}] = \"{}\"",
+            i,
+            chunk.metadata.char_start,
+            chunk.metadata.char_end,
+            chunk.content.replace('\n', "\\n")
+        );
+    }
+    println!("\nAdjacency verification:");
+    for i in 0..result_no_overlap.chunks.len() - 1 {
+        let current = &result_no_overlap.chunks[i];
+        let next = &result_no_overlap.chunks[i + 1];
+        let gap = next.metadata.char_start as i32 - current.metadata.char_end as i32;
+        println!(
+            "  Chunks {} and {}: gap = {} (next starts at {}, current ends at {})",
+            i,
+            i + 1,
+            gap,
+            next.metadata.char_start,
+            current.metadata.char_end
+        );
+        assert!(gap >= 0, "Should have no overlap (gap >= 0)");
+    }
+    println!("\n✓ All offset calculations are correct!");
+}

data/vendor/kreuzberg/tests/concurrency_stress.rs CHANGED Viewed

@@ -30,18 +30,6 @@ use tokio::time::timeout;
 mod helpers;
-fn trim_trailing_newlines(value: &str) -> &str {
-    value.trim_end_matches(['\n', '\r'])
-}
-fn assert_text_content(actual: &str, expected: &str) {
-    assert_eq!(
-        trim_trailing_newlines(actual),
-        expected,
-        "Content mismatch after trimming trailing newlines"
-    );
-}
 /// Test many concurrent extractions of different MIME types.
 ///
 /// Validates that:
@@ -156,7 +144,7 @@ async fn test_concurrent_extractions_with_cache() {
         let result = handle.await.expect("Task should not panic");
         assert!(result.is_ok(), "Cache read should succeed");
         let extraction = result.unwrap();
-        assert_text_content(&extraction.content, expected_content);
+        assert_eq!(extraction.content, expected_content);
     }
 }
@@ -171,10 +159,6 @@ async fn test_concurrent_extractions_with_cache() {
 async fn test_concurrent_ocr_processing() {
     use helpers::{get_test_file_path, skip_if_missing};
-    if cfg!(windows) {
-        return;
-    }
     if skip_if_missing("images/ocr_image.jpg") {
         tracing::debug!("Skipping concurrent OCR test: test file not available");
         return;

data/vendor/kreuzberg/tests/config_features.rs CHANGED Viewed

@@ -484,15 +484,8 @@ async fn test_quality_processing_disabled() {
 }
 /// Test chunking with embeddings using balanced preset.
-///
-/// This test requires ONNX Runtime to be installed as a system dependency.
-/// On macOS with Homebrew: `brew install onnxruntime`
-/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
-/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
 #[tokio::test]
 #[cfg(feature = "embeddings")]
-#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
-#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
 async fn test_chunking_with_embeddings() {
     use kreuzberg::core::config::EmbeddingConfig;
@@ -550,15 +543,8 @@ async fn test_chunking_with_embeddings() {
 }
 /// Test chunking with fast embedding preset.
-///
-/// This test requires ONNX Runtime to be installed as a system dependency.
-/// On macOS with Homebrew: `brew install onnxruntime`
-/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
-/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
 #[tokio::test]
 #[cfg(feature = "embeddings")]
-#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
-#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
 async fn test_chunking_with_fast_embeddings() {
     use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};
@@ -587,10 +573,6 @@ async fn test_chunking_with_fast_embeddings() {
     let chunks = result.chunks.expect("Should have chunks");
     assert!(!chunks.is_empty(), "Should have at least one chunk");
-    if let Some(error) = result.metadata.additional.get("embedding_error") {
-        panic!("Embedding generation failed: {}", error);
-    }
     for chunk in &chunks {
         let embedding = chunk.embedding.as_ref().expect("Should have embedding");
         assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");

data/vendor/kreuzberg/tests/config_loading_tests.rs CHANGED Viewed

@@ -124,6 +124,7 @@ ocr:
 fn test_from_file_nonexistent_path_fails() {
     let result = ExtractionConfig::from_file("/nonexistent/path/config.toml");
     assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result);
+    // Error can be Io or other types depending on the implementation
 }
 /// Test from_file with malformed TOML fails.
@@ -141,6 +142,7 @@ enabled = true
     let result = ExtractionConfig::from_file(&config_path);
     assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result);
+    // Error handling varies - just ensure it failed
 }
 /// Test from_file with malformed JSON fails.
@@ -162,6 +164,7 @@ fn test_from_file_malformed_json_fails() {
     let result = ExtractionConfig::from_file(&config_path);
     assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result);
+    // Error handling varies - just ensure it failed
 }
 /// Test from_file with malformed YAML fails.
@@ -180,6 +183,7 @@ ocr:
     let result = ExtractionConfig::from_file(&config_path);
     assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result);
+    // Error handling varies - just ensure it failed
 }
 /// Test from_file with empty file uses defaults.
@@ -194,6 +198,7 @@ fn test_from_file_empty_file_uses_defaults() {
     assert!(config.is_ok(), "Should load empty file successfully");
     let config = config.unwrap();
+    // Should have default values
     assert!(config.ocr.is_none(), "Default config should have no OCR");
     assert!(config.chunking.is_none(), "Default config should have no chunking");
 }
@@ -209,18 +214,22 @@ fn test_from_file_unsupported_extension_fails() {
     let result = ExtractionConfig::from_file(&config_path);
     assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result);
-    if let Err(KreuzbergError::Validation { message, .. }) = result {
-        assert!(
-            message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
-            "Error should mention format/extension: {}",
-            message
-        );
+    match result {
+        Err(KreuzbergError::Validation { message, .. }) => {
+            assert!(
+                message.contains("format") || message.contains("extension") || message.contains("Unsupported"),
+                "Error should mention format/extension: {}",
+                message
+            );
+        }
+        _ => {
+            // Some other error is also acceptable
+        }
     }
 }
 /// Test discover() finds config in current directory.
 #[test]
-#[serial_test::serial]
 fn test_discover_finds_config_in_current_dir() {
     let temp_dir = TempDir::new().unwrap();
     let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -232,11 +241,13 @@ enabled = true
     fs::write(&config_path, toml_content).unwrap();
+    // Change to temp directory
     let original_dir = std::env::current_dir().unwrap();
     std::env::set_current_dir(temp_dir.path()).unwrap();
     let result = ExtractionConfig::discover();
+    // Restore original directory
     std::env::set_current_dir(original_dir).unwrap();
     assert!(result.is_ok(), "Discover should succeed");
@@ -247,7 +258,6 @@ enabled = true
 /// Test discover() finds config in parent directory.
 #[test]
-#[serial_test::serial]
 fn test_discover_finds_config_in_parent_dir() {
     let temp_dir = TempDir::new().unwrap();
     let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -259,14 +269,17 @@ enabled = true
     fs::write(&config_path, toml_content).unwrap();
+    // Create subdirectory
     let sub_dir = temp_dir.path().join("subdir");
     fs::create_dir(&sub_dir).unwrap();
+    // Change to subdirectory
     let original_dir = std::env::current_dir().unwrap();
     std::env::set_current_dir(&sub_dir).unwrap();
     let result = ExtractionConfig::discover();
+    // Restore original directory
     std::env::set_current_dir(original_dir).unwrap();
     assert!(result.is_ok(), "Discover should succeed");
@@ -277,39 +290,44 @@ enabled = true
 /// Test discover() returns None when no config found.
 #[test]
-#[serial_test::serial]
 fn test_discover_returns_none_when_not_found() {
     let temp_dir = TempDir::new().unwrap();
     let sub_dir = temp_dir.path().join("subdir");
     fs::create_dir(&sub_dir).unwrap();
+    // Change to subdirectory (no config files)
     let original_dir = std::env::current_dir().unwrap();
     std::env::set_current_dir(&sub_dir).unwrap();
     let result = ExtractionConfig::discover();
+    // Restore original directory
     std::env::set_current_dir(original_dir).unwrap();
     assert!(result.is_ok(), "Discover should succeed even when no config found");
     let _config = result.unwrap();
+    // May return None or may find a config in parent directories (e.g., repository root)
+    // Just verify it doesn't error - the specific behavior depends on the directory structure
 }
 /// Test discover() prefers certain file names.
 #[test]
-#[serial_test::serial]
 fn test_discover_file_name_preference() {
     let temp_dir = TempDir::new().unwrap();
+    // Create multiple config files
     fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").unwrap();
     fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").unwrap();
     let original_dir = std::env::current_dir().unwrap();
     if std::env::set_current_dir(temp_dir.path()).is_err() {
+        // Skip this test if we can't change directory
         return;
     }
     let result = ExtractionConfig::discover();
+    // Always restore directory even if test fails
     let _ = std::env::set_current_dir(original_dir);
     assert!(result.is_ok(), "Discover should succeed");
@@ -319,7 +337,6 @@ fn test_discover_file_name_preference() {
 /// Test discover() with nested directories.
 #[test]
-#[serial_test::serial]
 fn test_discover_with_nested_directories() {
     let temp_dir = TempDir::new().unwrap();
     let config_path = temp_dir.path().join("kreuzberg.toml");
@@ -331,18 +348,22 @@ enabled = true
     fs::write(&config_path, toml_content).unwrap();
+    // Create nested subdirectories
     let level1 = temp_dir.path().join("level1");
     let level2 = level1.join("level2");
     let level3 = level2.join("level3");
     fs::create_dir_all(&level3).unwrap();
+    // Change to deepest directory
     let original_dir = std::env::current_dir().unwrap();
     if std::env::set_current_dir(&level3).is_err() {
+        // Skip this test if we can't change directory
         return;
     }
     let result = ExtractionConfig::discover();
+    // Always restore directory even if test fails
     let _ = std::env::set_current_dir(&original_dir);
     assert!(result.is_ok(), "Discover should succeed");
@@ -398,6 +419,7 @@ fn test_from_file_with_invalid_values() {
     let temp_dir = TempDir::new().unwrap();
     let config_path = temp_dir.path().join("config.toml");
+    // Negative values should be rejected during deserialization or validation
     let toml_content = r#"
 [chunking]
 max_chars = -1000
@@ -407,9 +429,11 @@ max_overlap = -100
     fs::write(&config_path, toml_content).unwrap();
     let result = ExtractionConfig::from_file(&config_path);
-    if let Ok(config) = result
-        && let Some(chunking) = config.chunking
-    {
-        assert!(chunking.max_chars > 0, "max_chars should be positive");
+    // Should either fail parsing or have clamped values
+    if let Ok(config) = result {
+        // If it succeeds, values should be reasonable
+        if let Some(chunking) = config.chunking {
+            assert!(chunking.max_chars > 0, "max_chars should be positive");
+        }
     }
 }

data/vendor/kreuzberg/tests/core_integration.rs CHANGED Viewed

@@ -11,18 +11,6 @@ use std::fs::{self, File};
 use std::io::Write;
 use tempfile::tempdir;
-fn trim_trailing_newlines(value: &str) -> &str {
-    value.trim_end_matches(['\n', '\r'])
-}
-fn assert_text_content(actual: &str, expected: &str) {
-    assert_eq!(
-        trim_trailing_newlines(actual),
-        expected,
-        "Content mismatch after trimming trailing newlines"
-    );
-}
 /// Test basic file extraction with MIME detection.
 #[tokio::test]
 async fn test_extract_file_basic() {
@@ -37,7 +25,7 @@ async fn test_extract_file_basic() {
     assert!(result.is_ok(), "Basic file extraction should succeed");
     let result = result.unwrap();
-    assert_text_content(&result.content, "Hello, Kreuzberg!");
+    assert_eq!(result.content, "Hello, Kreuzberg!");
     assert_eq!(result.mime_type, "text/plain");
     assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
     assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -193,12 +181,7 @@ async fn test_batch_extract_bytes_concurrency() {
     for (i, result) in results.iter().enumerate() {
         let expected_content = format!("content {}", i + 1);
-        assert_eq!(
-            trim_trailing_newlines(&result.content),
-            expected_content,
-            "Content mismatch for item {}",
-            i
-        );
+        assert_eq!(result.content, expected_content, "Content mismatch for item {}", i);
         assert_eq!(result.mime_type, "text/plain", "MIME type should be text/plain");
         assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
         assert!(result.detected_languages.is_none(), "Language detection not enabled");
@@ -218,13 +201,13 @@ fn test_sync_wrappers() {
     let result = extract_file_sync(&file_path, None, &config);
     assert!(result.is_ok(), "Sync file extraction should succeed");
     let extraction = result.unwrap();
-    assert_text_content(&extraction.content, "sync content");
+    assert_eq!(extraction.content, "sync content");
     assert!(extraction.chunks.is_none(), "Chunks should be None");
     let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
     assert!(result.is_ok(), "Sync bytes extraction should succeed");
     let extraction = result.unwrap();
-    assert_text_content(&extraction.content, "test bytes");
+    assert_eq!(extraction.content, "test bytes");
     assert!(extraction.chunks.is_none(), "Chunks should be None");
     let paths = vec![file_path];
@@ -232,7 +215,7 @@ fn test_sync_wrappers() {
     assert!(results.is_ok(), "Batch sync file should succeed");
     let results = results.unwrap();
     assert_eq!(results.len(), 1);
-    assert_text_content(&results[0].content, "sync content");
+    assert_eq!(results[0].content, "sync content");
     assert!(results[0].chunks.is_none(), "Chunks should be None");
     let contents = vec![(b"test".as_slice(), "text/plain")];
@@ -240,7 +223,7 @@ fn test_sync_wrappers() {
     assert!(results.is_ok(), "Batch bytes sync should succeed");
     let results = results.unwrap();
     assert_eq!(results.len(), 1);
-    assert_text_content(&results[0].content, "test");
+    assert_eq!(results[0].content, "test");
     assert!(results[0].chunks.is_none(), "Chunks should be None");
 }
@@ -432,7 +415,7 @@ async fn test_pipeline_execution() {
     assert!(result.is_ok(), "Pipeline execution should succeed");
     let result = result.unwrap();
-    assert_text_content(&result.content, "pipeline content");
+    assert_eq!(result.content, "pipeline content");
     assert_eq!(result.mime_type, "text/plain");
     assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
     assert!(result.detected_languages.is_none(), "Language detection not enabled");