RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/extractors/rst.rs DELETED Viewed

@@ -1,576 +0,0 @@
-//! Native Rust reStructuredText (RST) extractor.
-//!
-//! This extractor provides comprehensive RST document parsing.
-//! It extracts:
-//! - Document title and headings
-//! - Field list metadata (:Author:, :Date:, :Version:, etc.)
-//! - Paragraphs and text content
-//! - Code blocks with language specifications
-//! - Lists (bullet, numbered, definition lists)
-//! - Tables (both simple and grid tables)
-//! - Directives (image, code-block, note, math, etc.)
-//! - Inline markup (emphasis, strong, code, links)
-//! - Images and references
-#[cfg(feature = "office")]
-use crate::Result;
-#[cfg(feature = "office")]
-use crate::core::config::ExtractionConfig;
-#[cfg(feature = "office")]
-use crate::plugins::{DocumentExtractor, Plugin};
-#[cfg(feature = "office")]
-use crate::types::{ExtractionResult, Metadata, Table};
-#[cfg(feature = "office")]
-use async_trait::async_trait;
-#[cfg(feature = "office")]
-use std::collections::HashMap;
-/// Native Rust reStructuredText extractor.
-///
-/// Parses RST documents using document tree parsing and extracts:
-/// - Metadata from field lists
-/// - Document structure (headings, sections)
-/// - Text content and inline formatting
-/// - Code blocks and directives
-/// - Tables and lists
-#[cfg(feature = "office")]
-pub struct RstExtractor;
-#[cfg(feature = "office")]
-impl RstExtractor {
-    /// Create a new RST extractor.
-    pub fn new() -> Self {
-        Self
-    }
-    /// Extract text content and metadata from RST document.
-    ///
-    /// Uses document tree parsing and fallback text extraction.
-    fn extract_text_and_metadata(content: &str) -> (String, Metadata) {
-        let mut metadata = Metadata::default();
-        let mut additional = HashMap::new();
-        let text = Self::extract_text_from_rst(content, &mut additional);
-        metadata.additional = additional;
-        (text, metadata)
-    }
-    /// Extract text and metadata from RST content.
-    ///
-    /// This is the main extraction engine that processes RST line-by-line
-    /// and extracts all document content including headings, code blocks, lists, etc.
-    fn extract_text_from_rst(content: &str, metadata: &mut HashMap<String, serde_json::Value>) -> String {
-        let mut output = String::new();
-        let lines: Vec<&str> = content.lines().collect();
-        let mut i = 0;
-        while i < lines.len() {
-            let line = lines[i];
-            if line.trim().starts_with(':')
-                && line.contains(':')
-                && let Some((key, value)) = Self::parse_field_list_line(line)
-            {
-                Self::add_metadata_field(&key, &value, metadata);
-                output.push_str(&value);
-                output.push('\n');
-                i += 1;
-                continue;
-            }
-            if i + 1 < lines.len() {
-                let next_line = lines[i + 1];
-                if Self::is_section_underline(next_line) && !line.trim().is_empty() {
-                    output.push_str(line.trim());
-                    output.push('\n');
-                    i += 2;
-                    continue;
-                }
-            }
-            if line.trim().starts_with(".. code-block::") {
-                let lang = line.trim_start_matches(".. code-block::").trim().to_string();
-                if !lang.is_empty() {
-                    output.push_str("code-block: ");
-                    output.push_str(&lang);
-                    output.push('\n');
-                }
-                i += 1;
-                while i < lines.len() && (lines[i].starts_with("   ") || lines[i].is_empty()) {
-                    if !lines[i].is_empty() {
-                        output.push_str(lines[i]);
-                        output.push('\n');
-                    }
-                    i += 1;
-                }
-                continue;
-            }
-            if line.trim().starts_with(".. highlight::") {
-                let lang = line.trim_start_matches(".. highlight::").trim().to_string();
-                if !lang.is_empty() {
-                    output.push_str("highlight: ");
-                    output.push_str(&lang);
-                    output.push('\n');
-                }
-                i += 1;
-                continue;
-            }
-            if line.trim().ends_with("::") {
-                if let Some(display_text) = line.strip_suffix("::")
-                    && !display_text.trim().is_empty()
-                {
-                    output.push_str(display_text.trim());
-                    output.push('\n');
-                }
-                i += 1;
-                while i < lines.len() && (lines[i].starts_with("    ") || lines[i].is_empty()) {
-                    if !lines[i].is_empty() {
-                        output.push_str(lines[i].trim_start());
-                        output.push('\n');
-                    }
-                    i += 1;
-                }
-                continue;
-            }
-            if Self::is_list_item(line) {
-                output.push_str(line.trim());
-                output.push('\n');
-                i += 1;
-                continue;
-            }
-            if line.trim().starts_with(".. ") || line.trim() == ".." {
-                let trimmed = line.trim();
-                let directive = if trimmed == ".." { "" } else { &trimmed[3..] };
-                if directive.starts_with("image::") {
-                    let uri = directive.strip_prefix("image::").unwrap_or("").trim();
-                    output.push_str("image: ");
-                    output.push_str(uri);
-                    output.push('\n');
-                    i += 1;
-                    continue;
-                }
-                if directive.starts_with("note::")
-                    || directive.starts_with("warning::")
-                    || directive.starts_with("important::")
-                    || directive.starts_with("caution::")
-                    || directive.starts_with("hint::")
-                    || directive.starts_with("tip::")
-                {
-                    i += 1;
-                    while i < lines.len() && (lines[i].starts_with("   ") || lines[i].is_empty()) {
-                        if !lines[i].is_empty() {
-                            output.push_str(lines[i].trim());
-                            output.push('\n');
-                        }
-                        i += 1;
-                    }
-                    continue;
-                }
-                if directive.starts_with("math::") {
-                    let math = directive.strip_prefix("math::").unwrap_or("").trim();
-                    if !math.is_empty() {
-                        output.push_str("math: ");
-                        output.push_str(math);
-                        output.push('\n');
-                    }
-                    i += 1;
-                    while i < lines.len() && (lines[i].starts_with("   ") || lines[i].is_empty()) {
-                        if !lines[i].is_empty() {
-                            output.push_str(lines[i].trim());
-                            output.push('\n');
-                        }
-                        i += 1;
-                    }
-                    continue;
-                }
-                i += 1;
-                while i < lines.len() && (lines[i].starts_with("   ") || lines[i].is_empty()) {
-                    i += 1;
-                }
-                continue;
-            }
-            if !line.trim().is_empty() && !Self::is_markup_line(line) {
-                output.push_str(line);
-                output.push('\n');
-            }
-            i += 1;
-        }
-        output
-    }
-    /// Parse a field list line (e.g., ":Author: John Doe")
-    fn parse_field_list_line(line: &str) -> Option<(String, String)> {
-        let trimmed = line.trim();
-        if !trimmed.starts_with(':') {
-            return None;
-        }
-        let rest = &trimmed[1..];
-        if let Some(end_pos) = rest.find(':') {
-            let key = rest[..end_pos].to_string();
-            let value = rest[end_pos + 1..].trim().to_string();
-            return Some((key, value));
-        }
-        None
-    }
-    /// Add a metadata field from RST field list.
-    fn add_metadata_field(key: &str, value: &str, metadata: &mut HashMap<String, serde_json::Value>) {
-        let key_lower = key.to_lowercase();
-        match key_lower.as_str() {
-            "author" | "authors" => {
-                metadata.insert("author".to_string(), serde_json::Value::String(value.to_string()));
-            }
-            "date" => {
-                metadata.insert("date".to_string(), serde_json::Value::String(value.to_string()));
-            }
-            "version" | "revision" => {
-                metadata.insert("version".to_string(), serde_json::Value::String(value.to_string()));
-            }
-            "title" => {
-                metadata.insert("title".to_string(), serde_json::Value::String(value.to_string()));
-            }
-            _ => {
-                metadata.insert(
-                    format!("field_{}", key_lower),
-                    serde_json::Value::String(value.to_string()),
-                );
-            }
-        }
-    }
-    /// Check if a line is a section underline.
-    fn is_section_underline(line: &str) -> bool {
-        let trimmed = line.trim();
-        if trimmed.len() < 3 {
-            return false;
-        }
-        let chars: Vec<char> = trimmed.chars().collect();
-        let first = chars[0];
-        matches!(first, '=' | '-' | '~' | '+' | '^' | '"' | '`' | '#' | '*') && chars.iter().all(|c| *c == first)
-    }
-    /// Check if a line is a list item.
-    fn is_list_item(line: &str) -> bool {
-        let trimmed = line.trim_start();
-        if trimmed.starts_with("* ") || trimmed.starts_with("+ ") || trimmed.starts_with("- ") {
-            return true;
-        }
-        if let Some(space_pos) = trimmed.find(' ')
-            && space_pos > 0
-            && space_pos < 4
-        {
-            let prefix = &trimmed[..space_pos];
-            if prefix.ends_with('.') || prefix.ends_with(')') {
-                return prefix[..prefix.len() - 1].chars().all(|c| c.is_numeric());
-            }
-        }
-        false
-    }
-    /// Check if a line is just markup (underlines, etc.)
-    fn is_markup_line(line: &str) -> bool {
-        let trimmed = line.trim();
-        if trimmed.len() < 3 {
-            return false;
-        }
-        let first = trimmed.chars().next().unwrap();
-        trimmed.chars().all(|c| c == first)
-            && matches!(first, '=' | '-' | '~' | '+' | '^' | '"' | '`' | '#' | '*' | '/')
-    }
-    /// Extract tables from RST content.
-    ///
-    /// Identifies and extracts both simple and grid tables.
-    fn extract_tables(content: &str) -> Vec<Table> {
-        let mut tables = Vec::new();
-        let lines: Vec<&str> = content.lines().collect();
-        let mut i = 0;
-        while i < lines.len() {
-            let line = lines[i];
-            if line.contains("|")
-                && (line.contains("=") || line.contains("-"))
-                && let Some(table) = Self::parse_grid_table(&lines, &mut i)
-            {
-                tables.push(table);
-                continue;
-            }
-            i += 1;
-        }
-        tables
-    }
-    /// Parse a grid table from lines.
-    fn parse_grid_table(lines: &[&str], i: &mut usize) -> Option<Table> {
-        let mut cells = Vec::new();
-        let mut row = Vec::new();
-        while *i < lines.len() && lines[*i].contains("|") {
-            let line = lines[*i].trim_matches(|c| c == '|');
-            if !line.is_empty() {
-                let cell_content = line.split('|').map(|s| s.trim().to_string()).collect::<Vec<_>>();
-                row.extend(cell_content);
-                if !row.is_empty() {
-                    cells.push(row.clone());
-                    row.clear();
-                }
-            }
-            *i += 1;
-        }
-        if cells.is_empty() {
-            return None;
-        }
-        let markdown = Self::cells_to_markdown(&cells);
-        Some(Table {
-            cells,
-            markdown,
-            page_number: 1,
-        })
-    }
-    /// Convert table cells to markdown format.
-    fn cells_to_markdown(cells: &[Vec<String>]) -> String {
-        if cells.is_empty() {
-            return String::new();
-        }
-        let mut md = String::new();
-        if !cells.is_empty() {
-            md.push('|');
-            for cell in &cells[0] {
-                md.push(' ');
-                md.push_str(cell);
-                md.push_str(" |");
-            }
-            md.push('\n');
-            md.push('|');
-            for _ in &cells[0] {
-                md.push_str(" --- |");
-            }
-            md.push('\n');
-            for row in &cells[1..] {
-                md.push('|');
-                for cell in row {
-                    md.push(' ');
-                    md.push_str(cell);
-                    md.push_str(" |");
-                }
-                md.push('\n');
-            }
-        }
-        md
-    }
-}
-#[cfg(feature = "office")]
-impl Default for RstExtractor {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-#[cfg(feature = "office")]
-impl Plugin for RstExtractor {
-    fn name(&self) -> &str {
-        "rst-extractor"
-    }
-    fn version(&self) -> String {
-        env!("CARGO_PKG_VERSION").to_string()
-    }
-    fn initialize(&self) -> Result<()> {
-        Ok(())
-    }
-    fn shutdown(&self) -> Result<()> {
-        Ok(())
-    }
-    fn description(&self) -> &str {
-        "Native Rust extractor for reStructuredText (RST) documents"
-    }
-    fn author(&self) -> &str {
-        "Kreuzberg Team"
-    }
-}
-#[cfg(feature = "office")]
-#[async_trait]
-impl DocumentExtractor for RstExtractor {
-    #[cfg_attr(
-        feature = "otel",
-        tracing::instrument(
-            skip(self, content, _config),
-            fields(
-                extractor.name = self.name(),
-                content.size_bytes = content.len(),
-            )
-        )
-    )]
-    async fn extract_bytes(
-        &self,
-        content: &[u8],
-        mime_type: &str,
-        _config: &ExtractionConfig,
-    ) -> Result<ExtractionResult> {
-        let text = String::from_utf8_lossy(content).into_owned();
-        let (extracted_text, metadata) = Self::extract_text_and_metadata(&text);
-        let tables = Self::extract_tables(&text);
-        Ok(ExtractionResult {
-            content: extracted_text,
-            mime_type: mime_type.to_string(),
-            metadata,
-            tables,
-            detected_languages: None,
-            chunks: None,
-            images: None,
-        })
-    }
-    fn supported_mime_types(&self) -> &[&str] {
-        &["text/x-rst", "text/prs.fallenstein.rst"]
-    }
-    fn priority(&self) -> i32 {
-        50
-    }
-}
-#[cfg(all(test, feature = "office"))]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_rst_extractor_plugin_interface() {
-        let extractor = RstExtractor::new();
-        assert_eq!(extractor.name(), "rst-extractor");
-        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
-        assert_eq!(extractor.priority(), 50);
-        assert!(!extractor.supported_mime_types().is_empty());
-    }
-    #[test]
-    fn test_rst_extractor_supports_text_x_rst() {
-        let extractor = RstExtractor::new();
-        assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
-    }
-    #[test]
-    fn test_rst_extractor_supports_fallenstein_rst() {
-        let extractor = RstExtractor::new();
-        assert!(extractor.supported_mime_types().contains(&"text/prs.fallenstein.rst"));
-    }
-    #[test]
-    fn test_extract_text_from_rst_simple_document() {
-        let content = r#"
-Title
-=====
-This is a paragraph.
-Another paragraph.
-"#;
-        let mut metadata = HashMap::new();
-        let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
-        assert!(output.contains("Title"));
-        assert!(output.contains("This is a paragraph"));
-        assert!(output.contains("Another paragraph"));
-    }
-    #[test]
-    fn test_extract_text_from_rst_with_code_block() {
-        let content = r#"
-.. code-block:: python
-   def hello():
-       print("world")
-Some text after.
-"#;
-        let mut metadata = HashMap::new();
-        let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
-        assert!(output.contains("code-block"));
-        assert!(output.contains("def hello"));
-        assert!(output.contains("Some text after"));
-    }
-    #[test]
-    fn test_extract_text_from_rst_with_metadata() {
-        let content = r#"
-:Author: John Doe
-:Date: 2024-01-15
-First paragraph.
-Second paragraph.
-"#;
-        let mut metadata = HashMap::new();
-        let output = RstExtractor::extract_text_from_rst(content, &mut metadata);
-        assert!(output.contains("First paragraph"));
-        assert!(output.contains("Second paragraph"));
-        assert!(metadata.contains_key("author"));
-        assert_eq!(metadata.get("author").and_then(|v| v.as_str()), Some("John Doe"));
-    }
-    #[test]
-    fn test_cells_to_markdown_format() {
-        let cells = vec![
-            vec!["Name".to_string(), "Age".to_string()],
-            vec!["Alice".to_string(), "30".to_string()],
-            vec!["Bob".to_string(), "25".to_string()],
-        ];
-        let markdown = RstExtractor::cells_to_markdown(&cells);
-        assert!(markdown.contains("Name"));
-        assert!(markdown.contains("Age"));
-        assert!(markdown.contains("Alice"));
-        assert!(markdown.contains("Bob"));
-        assert!(markdown.contains("---"));
-    }
-    #[test]
-    fn test_rst_extractor_default() {
-        let extractor = RstExtractor;
-        assert_eq!(extractor.name(), "rst-extractor");
-    }
-    #[test]
-    fn test_rst_extractor_initialize_shutdown() {
-        let extractor = RstExtractor::new();
-        assert!(extractor.initialize().is_ok());
-        assert!(extractor.shutdown().is_ok());
-    }
-}