RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/extractors/jupyter.rs DELETED Viewed

@@ -1,367 +0,0 @@
-//! Jupyter Notebook extractor for .ipynb files.
-//!
-//! This extractor provides native Rust parsing of Jupyter notebooks,
-//! extracting:
-//! - Notebook metadata (kernelspec, language_info, nbformat)
-//! - Cell content (markdown and code cells in order)
-//! - Cell outputs (text, HTML, images)
-//! - Cell metadata (execution_count, tags)
-//!
-//! Requires the `office` feature.
-#[cfg(feature = "office")]
-use crate::Result;
-#[cfg(feature = "office")]
-use crate::core::config::ExtractionConfig;
-#[cfg(feature = "office")]
-use crate::plugins::{DocumentExtractor, Plugin};
-#[cfg(feature = "office")]
-use crate::types::{ExtractionResult, Metadata};
-#[cfg(feature = "office")]
-use async_trait::async_trait;
-#[cfg(feature = "office")]
-use serde_json::{Value, json};
-#[cfg(feature = "office")]
-use std::collections::HashMap;
-/// Jupyter Notebook extractor.
-///
-/// Extracts content from Jupyter notebook JSON files, including:
-/// - Notebook metadata (kernel, language, nbformat version)
-/// - Cell content (code and markdown)
-/// - Cell outputs (text, HTML, etc.)
-/// - Cell-level metadata (tags, execution counts)
-#[cfg(feature = "office")]
-pub struct JupyterExtractor;
-#[cfg(feature = "office")]
-impl JupyterExtractor {
-    /// Create a new Jupyter extractor.
-    pub fn new() -> Self {
-        Self
-    }
-    /// Extract content from a Jupyter notebook.
-    fn extract_notebook(content: &[u8]) -> Result<(String, HashMap<String, Value>)> {
-        let notebook: Value = serde_json::from_slice(content)
-            .map_err(|e| crate::KreuzbergError::parsing(format!("Failed to parse JSON: {}", e)))?;
-        let mut extracted_content = String::new();
-        let mut metadata = HashMap::new();
-        if let Some(notebook_metadata) = notebook.get("metadata").and_then(|m| m.as_object()) {
-            if let Some(kernelspec) = notebook_metadata.get("kernelspec")
-                && let Some(name) = kernelspec.get("name").and_then(|n| n.as_str())
-            {
-                extracted_content.push_str(&format!("Kernelspec: {}\n", name));
-                metadata.insert("kernelspec".to_string(), kernelspec.clone());
-            }
-            if let Some(language_info) = notebook_metadata.get("language_info")
-                && let Some(name) = language_info.get("name").and_then(|n| n.as_str())
-            {
-                extracted_content.push_str(&format!("Language: {}\n", name));
-                metadata.insert("language_info".to_string(), language_info.clone());
-            }
-        }
-        if let Some(nbformat) = notebook.get("nbformat") {
-            extracted_content.push_str(&format!("NBFormat: {}\n", nbformat));
-            metadata.insert("nbformat".to_string(), nbformat.clone());
-        }
-        extracted_content.push('\n');
-        if let Some(cells) = notebook.get("cells").and_then(|c| c.as_array()) {
-            for (cell_idx, cell) in cells.iter().enumerate() {
-                Self::extract_cell(cell, cell_idx, &mut extracted_content, &mut metadata)?;
-            }
-        }
-        Ok((extracted_content, metadata))
-    }
-    /// Extract content from a single cell.
-    fn extract_cell(
-        cell: &Value,
-        cell_idx: usize,
-        content: &mut String,
-        _metadata: &mut HashMap<String, Value>,
-    ) -> Result<()> {
-        let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
-        let cell_id = cell.get("id").and_then(|id| id.as_str());
-        if let Some(id) = cell_id {
-            content.push_str(&format!(":::: {{#{} .cell .{}}}\n", id, cell_type));
-        } else {
-            content.push_str(&format!(":::: {{#cell_{} .cell .{}}}\n", cell_idx, cell_type));
-        }
-        if let Some(cell_metadata) = cell.get("metadata").and_then(|m| m.as_object())
-            && let Some(tags) = cell_metadata.get("tags").and_then(|t| t.as_array())
-        {
-            let tag_strs: Vec<String> = tags
-                .iter()
-                .filter_map(|tag| tag.as_str().map(|s| s.to_string()))
-                .collect();
-            if !tag_strs.is_empty() {
-                content.push_str(&format!(" tags=[{}]", tag_strs.join(", ")));
-            }
-        }
-        content.push('\n');
-        match cell_type {
-            "markdown" => Self::extract_markdown_cell(cell, content)?,
-            "code" => Self::extract_code_cell(cell, content)?,
-            "raw" => Self::extract_raw_cell(cell, content)?,
-            _ => {
-                content.push_str(&format!("Unknown cell type: {}\n", cell_type));
-            }
-        }
-        content.push_str("::::\n\n");
-        Ok(())
-    }
-    /// Extract markdown cell content.
-    fn extract_markdown_cell(cell: &Value, content: &mut String) -> Result<()> {
-        if let Some(source) = cell.get("source") {
-            let cell_text = Self::extract_source(source);
-            content.push_str(&cell_text);
-        }
-        Ok(())
-    }
-    /// Extract code cell content and outputs.
-    fn extract_code_cell(cell: &Value, content: &mut String) -> Result<()> {
-        if let Some(exec_count) = cell.get("execution_count")
-            && !exec_count.is_null()
-        {
-            content.push_str(&format!("::: {{execution_count={}}}\n", exec_count));
-        }
-        if let Some(source) = cell.get("source") {
-            let cell_text = Self::extract_source(source);
-            content.push_str("```python\n");
-            content.push_str(&cell_text);
-            content.push_str("```\n");
-        }
-        if let Some(outputs) = cell.get("outputs").and_then(|o| o.as_array()) {
-            for output in outputs {
-                Self::extract_output(output, content)?;
-            }
-        }
-        Ok(())
-    }
-    /// Extract raw cell content.
-    fn extract_raw_cell(cell: &Value, content: &mut String) -> Result<()> {
-        if let Some(source) = cell.get("source") {
-            let cell_text = Self::extract_source(source);
-            content.push_str(&cell_text);
-        }
-        Ok(())
-    }
-    /// Extract source content from various formats.
-    ///
-    /// Source can be either a string or an array of strings.
-    fn extract_source(source: &Value) -> String {
-        match source {
-            Value::String(s) => s.clone(),
-            Value::Array(arr) => arr.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(""),
-            _ => String::new(),
-        }
-    }
-    /// Extract output from a cell.
-    fn extract_output(output: &Value, content: &mut String) -> Result<()> {
-        let output_type = output.get("output_type").and_then(|t| t.as_str()).unwrap_or("unknown");
-        content.push_str(&format!("::: {{.output .{}", output_type));
-        if let Some(exec_count) = output.get("execution_count")
-            && !exec_count.is_null()
-        {
-            content.push_str(&format!(" execution_count={}", exec_count));
-        }
-        content.push_str("}\n");
-        match output_type {
-            "stream" => Self::extract_stream_output(output, content)?,
-            "execute_result" | "display_data" => Self::extract_data_output(output, content)?,
-            "error" => Self::extract_error_output(output, content)?,
-            _ => {
-                content.push_str(&format!("Unknown output type: {}\n", output_type));
-            }
-        }
-        content.push_str(":::\n");
-        Ok(())
-    }
-    /// Extract stream output (stdout, stderr).
-    fn extract_stream_output(output: &Value, content: &mut String) -> Result<()> {
-        if let Some(name) = output.get("name").and_then(|n| n.as_str()) {
-            content.push_str(&format!("Stream: {}\n", name));
-        }
-        if let Some(text) = output.get("text") {
-            let text_content = Self::extract_source(text);
-            content.push_str(&text_content);
-        }
-        Ok(())
-    }
-    /// Extract data output (execute_result or display_data).
-    fn extract_data_output(output: &Value, content: &mut String) -> Result<()> {
-        if let Some(data) = output.get("data").and_then(|d| d.as_object()) {
-            let mime_types = vec![
-                "text/markdown",
-                "text/html",
-                "image/svg+xml",
-                "image/png",
-                "image/jpeg",
-                "application/json",
-                "text/plain",
-            ];
-            for mime_type in mime_types {
-                if let Some(mime_content) = data.get(mime_type) {
-                    content.push_str(&format!("MIME: {}\n", mime_type));
-                    let mime_text = Self::extract_source(mime_content);
-                    if !mime_text.is_empty() {
-                        content.push_str(&mime_text);
-                        content.push('\n');
-                    }
-                }
-            }
-        }
-        Ok(())
-    }
-    /// Extract error output.
-    fn extract_error_output(output: &Value, content: &mut String) -> Result<()> {
-        if let Some(ename) = output.get("ename").and_then(|e| e.as_str()) {
-            content.push_str(&format!("Error: {}\n", ename));
-        }
-        if let Some(evalue) = output.get("evalue").and_then(|e| e.as_str()) {
-            content.push_str(&format!("Value: {}\n", evalue));
-        }
-        if let Some(traceback) = output.get("traceback").and_then(|t| t.as_array()) {
-            for line in traceback {
-                if let Some(line_str) = line.as_str() {
-                    content.push_str(line_str);
-                    content.push('\n');
-                }
-            }
-        }
-        Ok(())
-    }
-}
-#[cfg(feature = "office")]
-impl Default for JupyterExtractor {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-#[cfg(feature = "office")]
-impl Plugin for JupyterExtractor {
-    fn name(&self) -> &str {
-        "jupyter-extractor"
-    }
-    fn version(&self) -> String {
-        env!("CARGO_PKG_VERSION").to_string()
-    }
-    fn initialize(&self) -> Result<()> {
-        Ok(())
-    }
-    fn shutdown(&self) -> Result<()> {
-        Ok(())
-    }
-    fn description(&self) -> &str {
-        "Extracts content from Jupyter notebooks (.ipynb files)"
-    }
-    fn author(&self) -> &str {
-        "Kreuzberg Team"
-    }
-}
-#[cfg(feature = "office")]
-#[async_trait]
-impl DocumentExtractor for JupyterExtractor {
-    #[cfg_attr(
-        feature = "otel",
-        tracing::instrument(
-            skip(self, content, _config),
-            fields(
-                extractor.name = self.name(),
-                content.size_bytes = content.len(),
-            )
-        )
-    )]
-    async fn extract_bytes(
-        &self,
-        content: &[u8],
-        mime_type: &str,
-        _config: &ExtractionConfig,
-    ) -> Result<ExtractionResult> {
-        let (extracted_content, additional_metadata) = Self::extract_notebook(content)?;
-        let mut metadata_additional = HashMap::new();
-        for (key, value) in additional_metadata {
-            metadata_additional.insert(key, json!(value));
-        }
-        Ok(ExtractionResult {
-            content: extracted_content,
-            mime_type: mime_type.to_string(),
-            metadata: Metadata {
-                additional: metadata_additional,
-                ..Default::default()
-            },
-            tables: vec![],
-            detected_languages: None,
-            chunks: None,
-            images: None,
-        })
-    }
-    fn supported_mime_types(&self) -> &[&str] {
-        &["application/x-ipynb+json"]
-    }
-    fn priority(&self) -> i32 {
-        50
-    }
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_jupyter_extractor_plugin_interface() {
-        let extractor = JupyterExtractor::new();
-        assert_eq!(extractor.name(), "jupyter-extractor");
-        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
-        assert_eq!(extractor.priority(), 50);
-        assert!(extractor.supported_mime_types().contains(&"application/x-ipynb+json"));
-    }
-}