RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/core/extractor.rs CHANGED Viewed

@@ -26,65 +26,6 @@ use serde_json::json;
 use std::path::Path;
 use std::sync::Arc;
-/// Record error information in the current OpenTelemetry span.
-///
-/// This function records error details in the current span when the `otel` feature is enabled.
-/// It marks the span with `otel.status_code=ERROR` and adds error type and message fields.
-///
-/// # Arguments
-///
-/// * `error` - The error to record in the span
-///
-/// # Example
-///
-/// ```rust,ignore
-/// let result = extract_file("doc.pdf", None, &config).await;
-/// #[cfg(feature = "otel")]
-/// if let Err(ref e) = result {
-///     record_error(e);
-/// }
-/// result
-/// ```
-#[cfg(feature = "otel")]
-fn record_error(error: &KreuzbergError) {
-    let span = tracing::Span::current();
-    span.record("otel.status_code", "ERROR");
-    span.record("error.type", format!("{:?}", error));
-    span.record("error.message", error.to_string());
-}
-/// Sanitize a file path to return only the filename.
-///
-/// This function extracts the filename from a path to avoid recording
-/// potentially sensitive full file paths in telemetry data.
-///
-/// # Arguments
-///
-/// * `path` - The path to sanitize
-///
-/// # Returns
-///
-/// The filename as a string, or "unknown" if extraction fails
-///
-/// # Security
-///
-/// This prevents PII (personally identifiable information) from appearing in
-/// traces by only recording filenames instead of full paths.
-///
-/// # Example
-///
-/// ```rust,ignore
-/// let path = Path::new("/home/user/documents/secret.pdf");
-/// assert_eq!(sanitize_path(path), "secret.pdf");
-/// ```
-#[cfg(feature = "otel")]
-fn sanitize_path(path: &Path) -> String {
-    path.file_name()
-        .and_then(|n| n.to_str())
-        .unwrap_or("unknown")
-        .to_string()
-}
 /// Global Tokio runtime for synchronous operations.
 ///
 /// This runtime is lazily initialized on first use and shared across all sync wrappers.
@@ -160,12 +101,6 @@ fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
 /// # Ok(())
 /// # }
 /// ```
-#[cfg_attr(feature = "otel", tracing::instrument(
-    skip(config, path),
-    fields(
-        extraction.filename = tracing::field::Empty,
-    )
-))]
 pub async fn extract_file(
     path: impl AsRef<Path>,
     mime_type: Option<&str>,
@@ -175,119 +110,86 @@ pub async fn extract_file(
     let path = path.as_ref();
-    #[cfg(feature = "otel")]
-    {
-        let span = tracing::Span::current();
-        span.record("extraction.filename", sanitize_path(path));
-    }
-    let result = async {
-        io::validate_file_exists(path)?;
+    io::validate_file_exists(path)?;
-        let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
+    let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
-        match detected_mime.as_str() {
-            #[cfg(feature = "office")]
-            LEGACY_WORD_MIME_TYPE => {
-                let original_bytes = tokio::fs::read(path).await?;
-                let conversion = convert_doc_to_docx(&original_bytes).await?;
-                let mut result =
-                    extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
-                apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
-                return Ok(result);
-            }
-            #[cfg(not(feature = "office"))]
-            LEGACY_WORD_MIME_TYPE => {
-                return Err(KreuzbergError::UnsupportedFormat(
-                    "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
-                ));
-            }
-            #[cfg(feature = "office")]
-            LEGACY_POWERPOINT_MIME_TYPE => {
-                let original_bytes = tokio::fs::read(path).await?;
-                let conversion = convert_ppt_to_pptx(&original_bytes).await?;
-                let mut result =
-                    extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
-                apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
-                return Ok(result);
-            }
-            #[cfg(not(feature = "office"))]
-            LEGACY_POWERPOINT_MIME_TYPE => {
-                return Err(KreuzbergError::UnsupportedFormat(
-                    "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
-                ));
-            }
-            _ => {}
+    match detected_mime.as_str() {
+        #[cfg(feature = "office")]
+        LEGACY_WORD_MIME_TYPE => {
+            let original_bytes = tokio::fs::read(path).await?;
+            let conversion = convert_doc_to_docx(&original_bytes).await?;
+            let mut result =
+                extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
+            apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
+            return Ok(result);
         }
-        extract_file_with_extractor(path, &detected_mime, config).await
-    }
-    .await;
-    #[cfg(feature = "otel")]
-    if let Err(ref e) = result {
-        record_error(e);
+        #[cfg(not(feature = "office"))]
+        LEGACY_WORD_MIME_TYPE => {
+            return Err(KreuzbergError::UnsupportedFormat(
+                "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
+            ));
+        }
+        #[cfg(feature = "office")]
+        LEGACY_POWERPOINT_MIME_TYPE => {
+            let original_bytes = tokio::fs::read(path).await?;
+            let conversion = convert_ppt_to_pptx(&original_bytes).await?;
+            let mut result =
+                extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
+            apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
+            return Ok(result);
+        }
+        #[cfg(not(feature = "office"))]
+        LEGACY_POWERPOINT_MIME_TYPE => {
+            return Err(KreuzbergError::UnsupportedFormat(
+                "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
+            ));
+        }
+        _ => {}
     }
-    result
+    extract_file_with_extractor(path, &detected_mime, config).await
 }
 /// Extract content from a byte array.
-#[cfg_attr(feature = "otel", tracing::instrument(
-    skip(config, content),
-    fields(
-        extraction.mime_type = mime_type,
-        extraction.size_bytes = content.len(),
-    )
-))]
 pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
     use crate::core::mime;
-    let result = async {
-        let validated_mime = mime::validate_mime_type(mime_type)?;
-        match validated_mime.as_str() {
-            #[cfg(feature = "office")]
-            LEGACY_WORD_MIME_TYPE => {
-                let conversion = convert_doc_to_docx(content).await?;
-                let mut result =
-                    extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
-                apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
-                return Ok(result);
-            }
-            #[cfg(not(feature = "office"))]
-            LEGACY_WORD_MIME_TYPE => {
-                return Err(KreuzbergError::UnsupportedFormat(
-                    "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
-                ));
-            }
-            #[cfg(feature = "office")]
-            LEGACY_POWERPOINT_MIME_TYPE => {
-                let conversion = convert_ppt_to_pptx(content).await?;
-                let mut result =
-                    extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
-                apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
-                return Ok(result);
-            }
-            #[cfg(not(feature = "office"))]
-            LEGACY_POWERPOINT_MIME_TYPE => {
-                return Err(KreuzbergError::UnsupportedFormat(
-                    "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
-                ));
-            }
-            _ => {}
-        }
-        extract_bytes_with_extractor(content, &validated_mime, config).await
-    }
-    .await;
+    let validated_mime = mime::validate_mime_type(mime_type)?;
-    #[cfg(feature = "otel")]
-    if let Err(ref e) = result {
-        record_error(e);
+    match validated_mime.as_str() {
+        #[cfg(feature = "office")]
+        LEGACY_WORD_MIME_TYPE => {
+            let conversion = convert_doc_to_docx(content).await?;
+            let mut result =
+                extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
+            apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
+            return Ok(result);
+        }
+        #[cfg(not(feature = "office"))]
+        LEGACY_WORD_MIME_TYPE => {
+            return Err(KreuzbergError::UnsupportedFormat(
+                "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
+            ));
+        }
+        #[cfg(feature = "office")]
+        LEGACY_POWERPOINT_MIME_TYPE => {
+            let conversion = convert_ppt_to_pptx(content).await?;
+            let mut result =
+                extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
+            apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
+            return Ok(result);
+        }
+        #[cfg(not(feature = "office"))]
+        LEGACY_POWERPOINT_MIME_TYPE => {
+            return Err(KreuzbergError::UnsupportedFormat(
+                "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
+            ));
+        }
+        _ => {}
     }
-    result
+    extract_bytes_with_extractor(content, &validated_mime, config).await
 }
 /// Extract content from multiple files concurrently.
@@ -310,13 +212,6 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
 ///
 /// Individual file errors are captured in the result metadata. System errors
 /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
-#[cfg_attr(feature = "otel", tracing::instrument(
-    skip(config, paths),
-    fields(
-        extraction.batch_size = paths.len(),
-    )
-))]
-#[cfg(feature = "tokio-runtime")]
 pub async fn batch_extract_file(
     paths: Vec<impl AsRef<Path>>,
     config: &ExtractionConfig,
@@ -407,13 +302,6 @@ pub async fn batch_extract_file(
 /// # Returns
 ///
 /// A vector of `ExtractionResult` in the same order as the input.
-#[cfg_attr(feature = "otel", tracing::instrument(
-    skip(config, contents),
-    fields(
-        extraction.batch_size = contents.len(),
-    )
-))]
-#[cfg(feature = "tokio-runtime")]
 pub async fn batch_extract_bytes(
     contents: Vec<(&[u8], &str)>,
     config: &ExtractionConfig,
@@ -592,10 +480,6 @@ mod tests {
     use std::io::Write;
     use tempfile::tempdir;
-    fn assert_text_content(actual: &str, expected: &str) {
-        assert_eq!(actual.trim_end_matches('\n'), expected);
-    }
     #[tokio::test]
     async fn test_extract_file_basic() {
         let dir = tempdir().unwrap();
@@ -608,7 +492,7 @@ mod tests {
         assert!(result.is_ok());
         let result = result.unwrap();
-        assert_text_content(&result.content, "Hello, world!");
+        assert_eq!(result.content, "Hello, world!");
         assert_eq!(result.mime_type, "text/plain");
     }
@@ -641,7 +525,7 @@ mod tests {
         assert!(result.is_ok());
         let result = result.unwrap();
-        assert_text_content(&result.content, "test content");
+        assert_eq!(result.content, "test content");
         assert_eq!(result.mime_type, "text/plain");
     }
@@ -669,8 +553,8 @@ mod tests {
         assert!(results.is_ok());
         let results = results.unwrap();
         assert_eq!(results.len(), 2);
-        assert_text_content(&results[0].content, "content 1");
-        assert_text_content(&results[1].content, "content 2");
+        assert_eq!(results[0].content, "content 1");
+        assert_eq!(results[1].content, "content 2");
     }
     #[tokio::test]
@@ -695,8 +579,8 @@ mod tests {
         assert!(results.is_ok());
         let results = results.unwrap();
         assert_eq!(results.len(), 2);
-        assert_text_content(&results[0].content, "content 1");
-        assert_text_content(&results[1].content, "content 2");
+        assert_eq!(results[0].content, "content 1");
+        assert_eq!(results[1].content, "content 2");
     }
     #[test]
@@ -709,8 +593,7 @@ mod tests {
         let result = extract_file_sync(&file_path, None, &config);
         assert!(result.is_ok());
-        let result = result.unwrap();
-        assert_text_content(&result.content, "sync test");
+        assert_eq!(result.unwrap().content, "sync test");
         let result = extract_bytes_sync(b"test", "text/plain", &config);
         assert!(result.is_ok());
@@ -722,14 +605,12 @@ mod tests {
         let result1 = extract_bytes(b"test 1", "text/plain", &config).await;
         assert!(result1.is_ok());
-        let result1 = result1.unwrap();
         let result2 = extract_bytes(b"test 2", "text/plain", &config).await;
         assert!(result2.is_ok());
-        let result2 = result2.unwrap();
-        assert_text_content(&result1.content, "test 1");
-        assert_text_content(&result2.content, "test 2");
+        assert_eq!(result1.unwrap().content, "test 1");
+        assert_eq!(result2.unwrap().content, "test 2");
         let result3 = extract_bytes(b"# test 3", "text/markdown", &config).await;
         assert!(result3.is_ok());
@@ -795,8 +676,7 @@ mod tests {
         let result = extract_file(&file_path, None, &config).await;
         assert!(result.is_ok());
-        let result = result.unwrap();
-        assert_text_content(&result.content, "content");
+        assert_eq!(result.unwrap().content, "content");
     }
     #[tokio::test]
@@ -836,7 +716,7 @@ mod tests {
         assert!(results.is_ok());
         let results = results.unwrap();
         assert_eq!(results.len(), 2);
-        assert_text_content(&results[0].content, "valid content");
+        assert_eq!(results[0].content, "valid content");
         assert!(results[1].metadata.error.is_some());
     }
@@ -853,9 +733,9 @@ mod tests {
         assert!(results.is_ok());
         let results = results.unwrap();
         assert_eq!(results.len(), 3);
-        assert_text_content(&results[0].content, "valid 1");
+        assert_eq!(results[0].content, "valid 1");
         assert!(results[1].metadata.error.is_some());
-        assert_text_content(&results[2].content, "valid 2");
+        assert_eq!(results[2].content, "valid 2");
     }
     #[tokio::test]
@@ -882,8 +762,7 @@ mod tests {
         assert!(result.is_ok());
         let result = result.unwrap();
-        let trimmed_len = result.content.trim_end_matches('\n').len();
-        assert_eq!(trimmed_len, 10_000_000);
+        assert_eq!(result.content.len(), 10_000_000);
     }
     #[tokio::test]
@@ -908,7 +787,7 @@ mod tests {
         assert_eq!(results.len(), 100);
         for (i, result) in results.iter().enumerate() {
-            assert_text_content(&result.content, &format!("content {}", i));
+            assert_eq!(result.content, format!("content {}", i));
         }
     }

data/vendor/kreuzberg/src/core/io.rs CHANGED Viewed

@@ -4,6 +4,7 @@
 use crate::{KreuzbergError, Result};
 use std::path::Path;
+use tokio::fs;
 /// Read a file asynchronously.
 ///
@@ -18,9 +19,8 @@ use std::path::Path;
 /// # Errors
 ///
 /// Returns `KreuzbergError::Io` for I/O errors (these always bubble up).
-#[cfg(feature = "tokio-runtime")]
 pub async fn read_file_async(path: impl AsRef<Path>) -> Result<Vec<u8>> {
-    tokio::fs::read(path.as_ref()).await.map_err(KreuzbergError::Io)
+    fs::read(path.as_ref()).await.map_err(KreuzbergError::Io)
 }
 /// Read a file synchronously.
@@ -181,7 +181,6 @@ mod tests {
     use std::io::Write;
     use tempfile::tempdir;
-    #[cfg(feature = "tokio-runtime")]
     #[tokio::test]
     async fn test_read_file_async() {
         let dir = tempdir().unwrap();
@@ -312,7 +311,6 @@ mod tests {
         assert!(result.is_err());
     }
-    #[cfg(feature = "tokio-runtime")]
     #[tokio::test]
     async fn test_read_file_async_io_error() {
         let result = read_file_async("/nonexistent/file.txt").await;

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -152,7 +152,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert("application/x-ipynb+json");
     set.insert("application/x-jats+xml");
     set.insert("application/x-latex");
-    set.insert("application/xml+opml");
     set.insert("application/x-opml+xml");
     set.insert("application/x-research-info-systems");
     set.insert("application/x-typst");
@@ -165,7 +164,6 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert("text/x-markdown-extra");
     set.insert("text/x-mdoc");
     set.insert("text/x-multimarkdown");
-    set.insert("text/x-opml");
     set.insert("text/x-org");
     set.insert("text/x-pod");
     set.insert("text/x-rst");
@@ -329,35 +327,43 @@ pub fn detect_or_validate(path: Option<&Path>, mime_type: Option<&str>) -> Resul
 ///
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
 pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
+    // Use infer crate for magic byte detection
     if let Some(kind) = infer::get(content) {
         let mime_type = kind.mime_type();
+        // Validate that it's a supported type
         if SUPPORTED_MIME_TYPES.contains(mime_type) || mime_type.starts_with("image/") {
             return Ok(mime_type.to_string());
         }
     }
+    // Try to detect text-based formats
     if let Ok(text) = std::str::from_utf8(content) {
         let trimmed = text.trim_start();
+        // Detect JSON
         if (trimmed.starts_with('{') || trimmed.starts_with('['))
             && serde_json::from_str::<serde_json::Value>(text).is_ok()
         {
             return Ok(JSON_MIME_TYPE.to_string());
         }
+        // Detect XML
         if trimmed.starts_with("<?xml") || trimmed.starts_with('<') {
             return Ok(XML_MIME_TYPE.to_string());
         }
+        // Detect HTML
         if trimmed.starts_with("<!DOCTYPE html") || trimmed.starts_with("<html") {
             return Ok(HTML_MIME_TYPE.to_string());
         }
+        // Detect PDF header
         if trimmed.starts_with("%PDF") {
             return Ok(PDF_MIME_TYPE.to_string());
         }
+        // Default to plain text for valid UTF-8
         return Ok(PLAIN_TEXT_MIME_TYPE.to_string());
     }
@@ -392,21 +398,25 @@ pub fn detect_mime_type_from_bytes(content: &[u8]) -> Result<String> {
 pub fn get_extensions_for_mime(mime_type: &str) -> Result<Vec<String>> {
     let mut extensions = Vec::new();
+    // Search through EXT_TO_MIME for matching MIME types
     for (ext, mime) in EXT_TO_MIME.iter() {
         if *mime == mime_type {
             extensions.push(ext.to_string());
         }
     }
+    // If we found extensions, return them
     if !extensions.is_empty() {
         return Ok(extensions);
     }
+    // Try using mime_guess crate for reverse lookup
     let guessed = mime_guess::get_mime_extensions_str(mime_type);
     if let Some(exts) = guessed {
         return Ok(exts.iter().map(|s| s.to_string()).collect());
     }
+    // No extensions found
     Err(KreuzbergError::UnsupportedFormat(format!(
         "No known extensions for MIME type: {}",
         mime_type

data/vendor/kreuzberg/src/core/mod.rs CHANGED Viewed

@@ -28,7 +28,6 @@
 //! # }
 //! ```
-#[cfg(feature = "tokio-runtime")]
 pub(crate) mod batch_mode;
 pub mod config;
 pub mod extractor;
@@ -40,6 +39,4 @@ pub use config::{
     ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
     TokenReductionConfig,
 };
-#[cfg(feature = "tokio-runtime")]
-pub use extractor::{batch_extract_bytes, batch_extract_file};
-pub use extractor::{extract_bytes, extract_file};
+pub use extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};