RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/lib/kreuzberg/error_context.rb DELETED Viewed

@@ -1,32 +0,0 @@
-# frozen_string_literal: true
-require 'json'
-module Kreuzberg
-  # ErrorContext module provides access to FFI error introspection functions.
-  # Retrieve the last error code and panic context information from errors.
-  module ErrorContext
-    class << self
-      def last_error_code
-        Kreuzberg._last_error_code_native
-      rescue StandardError
-        0
-      end
-      def last_panic_context
-        json_str = Kreuzberg._last_panic_context_json_native
-        return nil unless json_str
-        Errors::PanicContext.from_json(json_str)
-      rescue StandardError
-        nil
-      end
-      def last_panic_context_json
-        Kreuzberg._last_panic_context_json_native
-      rescue StandardError
-        nil
-      end
-    end
-  end
-end

data/vendor/kreuzberg/benches/otel_overhead.rs DELETED Viewed

@@ -1,48 +0,0 @@
-use criterion::{Criterion, criterion_group, criterion_main};
-use std::hint::black_box;
-fn bench_text_extraction(c: &mut Criterion) {
-    let runtime = tokio::runtime::Runtime::new().unwrap();
-    c.bench_function("extract_text_no_otel", |b| {
-        b.iter(|| {
-            runtime.block_on(async {
-                use kreuzberg::core::config::ExtractionConfig;
-                use kreuzberg::core::extractor::extract_bytes;
-                let test_content = black_box(b"Hello, World! This is a test document.");
-                let config = ExtractionConfig::default();
-                extract_bytes(test_content, "text/plain", &config).await
-            })
-        });
-    });
-}
-fn bench_cache_operations(c: &mut Criterion) {
-    use kreuzberg::cache::GenericCache;
-    use tempfile::tempdir;
-    let temp_dir = tempdir().unwrap();
-    let cache = GenericCache::new(
-        "bench".to_string(),
-        Some(temp_dir.path().to_str().unwrap().to_string()),
-        30.0,
-        500.0,
-        1000.0,
-    )
-    .unwrap();
-    c.bench_function("cache_set_get", |b| {
-        b.iter(|| {
-            let key = black_box("bench_key");
-            let data = black_box(b"benchmark data".to_vec());
-            cache.set(key, data.clone(), None).unwrap();
-            cache.get(key, None).unwrap()
-        });
-    });
-}
-criterion_group!(benches, bench_text_extraction, bench_cache_operations);
-criterion_main!(benches);

data/vendor/kreuzberg/src/extraction/markdown.rs DELETED Viewed

@@ -1,213 +0,0 @@
-//! Markdown table formatting utilities
-//!
-//! This module provides utilities for converting tabular data into GitHub-Flavored Markdown (GFM) tables.
-//! It's used by multiple extractors (DOCX, HTML) that need to represent structured table data in markdown format.
-/// Converts a 2D vector of cell strings into a GitHub-Flavored Markdown table.
-///
-/// # Behavior
-///
-/// - The first row is treated as the header row
-/// - A separator row is inserted after the header
-/// - Pipe characters (`|`) in cell content are automatically escaped with backslash
-/// - Irregular tables (rows with varying column counts) are padded with empty cells to match the header
-/// - Returns an empty string for empty input
-///
-/// # Arguments
-///
-/// * `cells` - A slice of vectors representing table rows, where each inner vector contains cell values
-///
-/// # Returns
-///
-/// A `String` containing the GFM markdown table representation
-///
-/// # Examples
-///
-/// ```
-/// # use kreuzberg::extraction::cells_to_markdown;
-/// let cells = vec![
-///     vec!["Name".to_string(), "Age".to_string()],
-///     vec!["Alice".to_string(), "30".to_string()],
-///     vec!["Bob".to_string(), "25".to_string()],
-/// ];
-///
-/// let markdown = cells_to_markdown(&cells);
-/// assert!(markdown.contains("| Name | Age |"));
-/// assert!(markdown.contains("|------|------|"));
-/// ```
-pub fn cells_to_markdown(cells: &[Vec<String>]) -> String {
-    if cells.is_empty() {
-        return String::new();
-    }
-    let mut markdown = String::new();
-    let num_cols = cells.first().map(|r| r.len()).unwrap_or(0);
-    if num_cols == 0 {
-        return String::new();
-    }
-    if let Some(header) = cells.first() {
-        markdown.push('|');
-        for cell in header {
-            markdown.push(' ');
-            let escaped = cell.replace('|', "\\|");
-            markdown.push_str(&escaped);
-            markdown.push_str(" |");
-        }
-        markdown.push('\n');
-        markdown.push('|');
-        for _ in 0..num_cols {
-            markdown.push_str("------|");
-        }
-        markdown.push('\n');
-    }
-    for row in cells.iter().skip(1) {
-        markdown.push('|');
-        for (idx, cell) in row.iter().enumerate() {
-            if idx >= num_cols {
-                break;
-            }
-            markdown.push(' ');
-            let escaped = cell.replace('|', "\\|");
-            markdown.push_str(&escaped);
-            markdown.push_str(" |");
-        }
-        for _ in row.len()..num_cols {
-            markdown.push_str(" |");
-        }
-        markdown.push('\n');
-    }
-    markdown
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_markdown_formatting_from_simple_table() {
-        let cells = vec![
-            vec!["Header1".to_string(), "Header2".to_string()],
-            vec!["Row1Col1".to_string(), "Row1Col2".to_string()],
-            vec!["Row2Col1".to_string(), "Row2Col2".to_string()],
-        ];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("| Header1 | Header2 |"));
-        assert!(markdown.contains("|------|------|"));
-        assert!(markdown.contains("| Row1Col1 | Row1Col2 |"));
-        assert!(markdown.contains("| Row2Col1 | Row2Col2 |"));
-        let lines: Vec<&str> = markdown.lines().collect();
-        assert_eq!(lines.len(), 4);
-    }
-    #[test]
-    fn test_markdown_handles_empty_input() {
-        let cells: Vec<Vec<String>> = vec![];
-        let markdown = cells_to_markdown(&cells);
-        assert_eq!(markdown, "");
-    }
-    #[test]
-    fn test_markdown_escapes_pipe_characters() {
-        let cells = vec![vec!["Header".to_string()], vec!["Cell with | pipe".to_string()]];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("Cell with \\| pipe"));
-        for line in markdown.lines() {
-            if !line.is_empty() {
-                assert!(line.starts_with('|'));
-                assert!(line.ends_with('|'));
-            }
-        }
-    }
-    #[test]
-    fn test_markdown_pads_irregular_tables() {
-        let cells = vec![
-            vec!["H1".to_string(), "H2".to_string(), "H3".to_string()],
-            vec!["R1C1".to_string(), "R1C2".to_string()],
-            vec!["R2C1".to_string(), "R2C2".to_string(), "R2C3".to_string()],
-        ];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("| H1 | H2 | H3 |"));
-        assert!(markdown.contains("| R1C1 | R1C2 | |"));
-        let lines: Vec<&str> = markdown.lines().filter(|l| !l.is_empty()).collect();
-        let pipe_counts: Vec<usize> = lines
-            .iter()
-            .map(|line| line.chars().filter(|c| *c == '|').count())
-            .collect();
-        assert!(pipe_counts.iter().all(|&count| count == pipe_counts[0]));
-    }
-    #[test]
-    fn test_markdown_single_row_table() {
-        let cells = vec![vec!["OnlyHeader".to_string()]];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("| OnlyHeader |"));
-        assert!(markdown.contains("|------|"));
-        let lines: Vec<&str> = markdown.lines().collect();
-        assert_eq!(lines.len(), 2);
-    }
-    #[test]
-    fn test_markdown_single_column_table() {
-        let cells = vec![
-            vec!["Header".to_string()],
-            vec!["Data1".to_string()],
-            vec!["Data2".to_string()],
-        ];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("| Header |"));
-        assert!(markdown.contains("|------|"));
-        assert!(markdown.contains("| Data1 |"));
-        assert!(markdown.contains("| Data2 |"));
-    }
-    #[test]
-    fn test_markdown_special_characters() {
-        let cells = vec![
-            vec!["*Header*".to_string(), "#Title".to_string()],
-            vec!["**Bold**".to_string(), "~~Strike~~".to_string()],
-        ];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("*Header*"));
-        assert!(markdown.contains("#Title"));
-        assert!(markdown.contains("**Bold**"));
-        assert!(markdown.contains("~~Strike~~"));
-    }
-    #[test]
-    fn test_markdown_unicode_content() {
-        let cells = vec![
-            vec!["Emoji".to_string(), "Accents".to_string()],
-            vec!["🎉 Party".to_string(), "Café".to_string()],
-        ];
-        let markdown = cells_to_markdown(&cells);
-        assert!(markdown.contains("🎉 Party"));
-        assert!(markdown.contains("Café"));
-    }
-}

data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs DELETED Viewed

@@ -1,287 +0,0 @@
-//! ODT (OpenDocument) metadata extraction from meta.xml
-//!
-//! Extracts metadata from OpenDocument Text files following the OASIS OpenDocument standard.
-use crate::error::{KreuzbergError, Result};
-use std::io::Read;
-use zip::ZipArchive;
-/// OpenDocument metadata from meta.xml
-///
-/// Contains metadata fields defined by the OASIS OpenDocument Format standard.
-/// Uses Dublin Core elements (dc:) and OpenDocument meta elements (meta:).
-#[derive(Debug, Clone, Default, PartialEq)]
-pub struct OdtProperties {
-    /// Document title (dc:title)
-    pub title: Option<String>,
-    /// Document subject/topic (dc:subject)
-    pub subject: Option<String>,
-    /// Current document creator/author (dc:creator)
-    pub creator: Option<String>,
-    /// Initial creator of the document (meta:initial-creator)
-    pub initial_creator: Option<String>,
-    /// Keywords or tags (meta:keyword)
-    pub keywords: Option<String>,
-    /// Document description (dc:description)
-    pub description: Option<String>,
-    /// Current modification date (dc:date)
-    pub date: Option<String>,
-    /// Initial creation date (meta:creation-date)
-    pub creation_date: Option<String>,
-    /// Document language (dc:language)
-    pub language: Option<String>,
-    /// Generator/application that created the document (meta:generator)
-    pub generator: Option<String>,
-    /// Editing duration in ISO 8601 format (meta:editing-duration)
-    pub editing_duration: Option<String>,
-    /// Number of edits/revisions (meta:editing-cycles)
-    pub editing_cycles: Option<String>,
-    /// Document statistics - page count (meta:page-count)
-    pub page_count: Option<i32>,
-    /// Document statistics - word count (meta:word-count)
-    pub word_count: Option<i32>,
-    /// Document statistics - character count (meta:character-count)
-    pub character_count: Option<i32>,
-    /// Document statistics - paragraph count (meta:paragraph-count)
-    pub paragraph_count: Option<i32>,
-    /// Document statistics - table count (meta:table-count)
-    pub table_count: Option<i32>,
-    /// Document statistics - image count (meta:image-count)
-    pub image_count: Option<i32>,
-}
-/// Extract ODT metadata from an OpenDocument file
-///
-/// Parses `meta.xml` from the ZIP archive and extracts OpenDocument metadata.
-///
-/// # Arguments
-///
-/// * `archive` - ZIP archive containing the OpenDocument file
-///
-/// # Returns
-///
-/// Returns `OdtProperties` with extracted metadata. Fields that are not present
-/// in the document will be `None`.
-///
-/// # Errors
-///
-/// Returns an error if:
-/// - The ZIP archive cannot be read
-/// - The meta.xml file is malformed
-/// - XML parsing fails
-///
-/// # Example
-///
-/// ```no_run
-/// use kreuzberg::extraction::office_metadata::extract_odt_properties;
-/// use std::fs::File;
-/// use zip::ZipArchive;
-///
-/// let file = File::open("document.odt")?;
-/// let mut archive = ZipArchive::new(file)?;
-/// let props = extract_odt_properties(&mut archive)?;
-///
-/// println!("Title: {:?}", props.title);
-/// println!("Creator: {:?}", props.creator);
-/// println!("Created: {:?}", props.creation_date);
-/// # Ok::<(), Box<dyn std::error::Error>>(())
-/// ```
-pub fn extract_odt_properties<R: Read + std::io::Seek>(archive: &mut ZipArchive<R>) -> Result<OdtProperties> {
-    let mut xml_content = String::new();
-    match archive.by_name("meta.xml") {
-        Ok(mut file) => {
-            file.read_to_string(&mut xml_content)
-                .map_err(|e| KreuzbergError::parsing(format!("Failed to read meta.xml: {}", e)))?;
-        }
-        Err(_) => {
-            return Ok(OdtProperties::default());
-        }
-    }
-    let doc = roxmltree::Document::parse(&xml_content)
-        .map_err(|e| KreuzbergError::parsing(format!("Failed to parse meta.xml: {}", e)))?;
-    let root = doc.root_element();
-    // Extract Dublin Core elements
-    let title = super::parse_xml_text(root, "title");
-    let subject = super::parse_xml_text(root, "subject");
-    let creator = super::parse_xml_text(root, "creator");
-    let description = super::parse_xml_text(root, "description");
-    let language = super::parse_xml_text(root, "language");
-    let date = super::parse_xml_text(root, "date");
-    // Extract OpenDocument meta elements
-    let initial_creator = super::parse_xml_text(root, "initial-creator");
-    let keywords = super::parse_xml_text(root, "keyword");
-    let creation_date = super::parse_xml_text(root, "creation-date");
-    let generator = super::parse_xml_text(root, "generator");
-    let editing_duration = super::parse_xml_text(root, "editing-duration");
-    let editing_cycles = super::parse_xml_text(root, "editing-cycles");
-    // Extract document statistics
-    let page_count = super::parse_xml_int(root, "page-count");
-    let word_count = super::parse_xml_int(root, "word-count");
-    let character_count = super::parse_xml_int(root, "character-count");
-    let paragraph_count = super::parse_xml_int(root, "paragraph-count");
-    let table_count = super::parse_xml_int(root, "table-count");
-    let image_count = super::parse_xml_int(root, "image-count");
-    Ok(OdtProperties {
-        title,
-        subject,
-        creator,
-        initial_creator,
-        keywords,
-        description,
-        date,
-        creation_date,
-        language,
-        generator,
-        editing_duration,
-        editing_cycles,
-        page_count,
-        word_count,
-        character_count,
-        paragraph_count,
-        table_count,
-        image_count,
-    })
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::io::{Cursor, Write};
-    fn create_test_zip_with_meta_xml(meta_xml: &str) -> ZipArchive<Cursor<Vec<u8>>> {
-        let buffer = Vec::new();
-        let cursor = Cursor::new(buffer);
-        let mut zip = zip::ZipWriter::new(cursor);
-        let options = zip::write::FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
-        zip.start_file("meta.xml", options).unwrap();
-        zip.write_all(meta_xml.as_bytes()).unwrap();
-        let cursor = zip.finish().unwrap();
-        ZipArchive::new(cursor).unwrap()
-    }
-    #[test]
-    fn test_extract_odt_properties_full() {
-        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
-<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
-                      xmlns:dc="http://purl.org/dc/elements/1.1/"
-                      xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
-                      office:version="1.3">
-  <office:meta>
-    <dc:title>Test Document</dc:title>
-    <dc:subject>Testing</dc:subject>
-    <dc:creator>John Doe</dc:creator>
-    <meta:initial-creator>Jane Smith</meta:initial-creator>
-    <dc:description>A test document for ODT metadata</dc:description>
-    <meta:keyword>test, metadata, odt</meta:keyword>
-    <dc:language>en-US</dc:language>
-    <meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
-    <dc:date>2024-01-02T15:30:00Z</dc:date>
-    <meta:generator>LibreOffice/24.2</meta:generator>
-    <meta:editing-duration>PT2H30M</meta:editing-duration>
-    <meta:editing-cycles>5</meta:editing-cycles>
-    <meta:page-count>10</meta:page-count>
-    <meta:word-count>1500</meta:word-count>
-    <meta:character-count>9000</meta:character-count>
-    <meta:paragraph-count>45</meta:paragraph-count>
-    <meta:table-count>3</meta:table-count>
-    <meta:image-count>7</meta:image-count>
-  </office:meta>
-</office:document-meta>"#;
-        let mut archive = create_test_zip_with_meta_xml(meta_xml);
-        let props = extract_odt_properties(&mut archive).unwrap();
-        assert_eq!(props.title, Some("Test Document".to_string()));
-        assert_eq!(props.subject, Some("Testing".to_string()));
-        assert_eq!(props.creator, Some("John Doe".to_string()));
-        assert_eq!(props.initial_creator, Some("Jane Smith".to_string()));
-        assert_eq!(props.keywords, Some("test, metadata, odt".to_string()));
-        assert_eq!(props.description, Some("A test document for ODT metadata".to_string()));
-        assert_eq!(props.language, Some("en-US".to_string()));
-        assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
-        assert_eq!(props.date, Some("2024-01-02T15:30:00Z".to_string()));
-        assert_eq!(props.generator, Some("LibreOffice/24.2".to_string()));
-        assert_eq!(props.editing_duration, Some("PT2H30M".to_string()));
-        assert_eq!(props.editing_cycles, Some("5".to_string()));
-        assert_eq!(props.page_count, Some(10));
-        assert_eq!(props.word_count, Some(1500));
-        assert_eq!(props.character_count, Some(9000));
-        assert_eq!(props.paragraph_count, Some(45));
-        assert_eq!(props.table_count, Some(3));
-        assert_eq!(props.image_count, Some(7));
-    }
-    #[test]
-    fn test_extract_odt_properties_minimal() {
-        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
-<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
-                      xmlns:dc="http://purl.org/dc/elements/1.1/"
-                      xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
-                      office:version="1.3">
-  <office:meta>
-    <dc:creator>Alice</dc:creator>
-    <meta:creation-date>2024-01-01T10:00:00Z</meta:creation-date>
-  </office:meta>
-</office:document-meta>"#;
-        let mut archive = create_test_zip_with_meta_xml(meta_xml);
-        let props = extract_odt_properties(&mut archive).unwrap();
-        assert_eq!(props.creator, Some("Alice".to_string()));
-        assert_eq!(props.creation_date, Some("2024-01-01T10:00:00Z".to_string()));
-        assert_eq!(props.title, None);
-        assert_eq!(props.keywords, None);
-        assert_eq!(props.word_count, None);
-    }
-    #[test]
-    fn test_extract_odt_properties_empty_elements() {
-        let meta_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
-<office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
-                      xmlns:dc="http://purl.org/dc/elements/1.1/"
-                      xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
-                      office:version="1.3">
-  <office:meta>
-    <dc:title></dc:title>
-    <dc:creator>Bob</dc:creator>
-  </office:meta>
-</office:document-meta>"#;
-        let mut archive = create_test_zip_with_meta_xml(meta_xml);
-        let props = extract_odt_properties(&mut archive).unwrap();
-        assert_eq!(props.title, None);
-        assert_eq!(props.creator, Some("Bob".to_string()));
-    }
-    #[test]
-    fn test_extract_odt_properties_missing_file() {
-        let buffer = Vec::new();
-        let cursor = Cursor::new(buffer);
-        let zip = zip::ZipWriter::new(cursor);
-        let cursor = zip.finish().unwrap();
-        let mut archive = ZipArchive::new(cursor).unwrap();
-        let props = extract_odt_properties(&mut archive).unwrap();
-        assert_eq!(props, OdtProperties::default());
-    }
-    #[test]
-    fn test_extract_odt_properties_malformed_xml() {
-        let meta_xml = "not valid xml <";
-        let mut archive = create_test_zip_with_meta_xml(meta_xml);
-        let result = extract_odt_properties(&mut archive);
-        assert!(result.is_err());
-    }
-}