RubyGems - kreuzberg - Versions diffs - 4.2.0 → 4.2.1 - Mend

kreuzberg 4.2.0 → 4.2.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (39)

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +56 -9
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +23 -11
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/tables_spec.rb +11 -2
data/spec/unit/config/output_format_spec.rb +18 -18
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/startup.rs +15 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
data/vendor/kreuzberg/tests/core_integration.rs +2 -4
data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +4 -2

data/lib/kreuzberg/result.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Kreuzberg
   # rubocop:disable Metrics/ClassLength
   class Result
     attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
-                :detected_languages, :chunks, :images, :pages, :elements
+                :detected_languages, :chunks, :images, :pages, :elements, :djot_content
     # @!attribute [r] cells
     #   @return [Array<Array<String>>] Table cells (2D array)
@@ -180,6 +180,7 @@ module Kreuzberg
     #
     # @param hash [Hash] Hash returned from native extension
     #
+    # rubocop:disable Metrics/AbcSize
     def initialize(hash)
       @content = get_value(hash, 'content', '')
       @mime_type = get_value(hash, 'mime_type', '')
@@ -191,7 +192,9 @@ module Kreuzberg
       @images = parse_images(get_value(hash, 'images'))
       @pages = parse_pages(get_value(hash, 'pages'))
       @elements = parse_elements(get_value(hash, 'elements'))
+      @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
     end
+    # rubocop:enable Metrics/AbcSize
     # Convert to hash
     #
@@ -207,7 +210,8 @@ module Kreuzberg
         chunks: serialize_chunks,
         images: serialize_images,
         pages: serialize_pages,
-        elements: serialize_elements
+        elements: serialize_elements,
+        djot_content: @djot_content&.to_h
       }
     end
@@ -434,6 +438,12 @@ module Kreuzberg
         y1: coordinates_data['y1'].to_f
       )
     end
+    def parse_djot_content(djot_data)
+      return nil if djot_data.nil?
+      DjotContent.new(djot_data)
+    end
   end
   # rubocop:enable Metrics/ClassLength
 end

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.0'
+  VERSION = '4.2.1'
 end

data/lib/kreuzberg.rb CHANGED Viewed

@@ -87,6 +87,7 @@ end
 require_relative 'kreuzberg/cache_api'
 require_relative 'kreuzberg/extraction_api'
+require_relative 'kreuzberg/djot_content'
 Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
 Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -417,14 +417,23 @@ module Kreuzberg
       attr_reader plain_text: String
       attr_reader blocks: Array[DjotContent::FormattedBlock]
       attr_reader metadata: Hash[untyped, untyped]
-      attr_reader tables: Array[Table]
+      attr_reader metadata_json: String
+      attr_reader tables: Array[untyped]
       attr_reader images: Array[DjotContent::DjotImage]
       attr_reader links: Array[DjotContent::DjotLink]
       attr_reader footnotes: Array[DjotContent::Footnote]
       attr_reader attributes: Hash[String, untyped]?
-      def initialize: (djot_content_hash hash) -> void
-      def to_h: () -> djot_content_hash
+      def initialize: (untyped hash) -> void
+      def to_h: () -> Hash[Symbol, untyped]
+      private
+      def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
+      def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
+      def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
+      def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
+      def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
       class FormattedBlock
         attr_reader block_type: String
@@ -433,28 +442,31 @@ module Kreuzberg
         attr_reader children: Array[FormattedBlock]?
         attr_reader attributes: Hash[String, untyped]?
-        def initialize: (formatted_block_hash hash) -> void
-        def to_h: () -> formatted_block_hash
+        def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotImage
         attr_reader url: String
         attr_reader alt: String?
         attr_reader title: String?
-        attr_reader attributes: Hash[String, untyped]?
+        attr_reader width: Integer?
+        attr_reader height: Integer?
-        def initialize: (djot_image_hash hash) -> void
-        def to_h: () -> djot_image_hash
+        def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
+        def src: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotLink
         attr_reader url: String
-        attr_reader text: String
+        attr_reader text: String?
         attr_reader title: String?
         attr_reader link_type: String?
-        def initialize: (djot_link_hash hash) -> void
-        def to_h: () -> djot_link_hash
+        def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
+        def href: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class Footnote

data/spec/binding/batch_spec.rb CHANGED Viewed

@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
   end
   describe 'batch error handling' do
-    it 'handles missing files gracefully in batch' do
+    it 'raises IOError for missing files in batch' do
       paths = [
         '/nonexistent/file1.txt',
         '/nonexistent/file2.txt'
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
       expect do
         described_class.batch_extract_files_sync(paths: paths)
-      end.not_to raise_error
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     end
-    it 'handles mixed valid and invalid paths' do
+    it 'raises IOError when batch contains invalid paths' do
       paths = []
       temp_dir = Dir.mktmpdir
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
       paths << '/nonexistent/invalid.txt'
-      results = described_class.batch_extract_files_sync(paths: paths)
-      expect(results).to be_a(Array)
+      expect do
+        described_class.batch_extract_files_sync(paths: paths)
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     ensure
       FileUtils.remove_entry(temp_dir)
     end

data/spec/binding/error_recovery_spec.rb CHANGED Viewed

@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
       nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
       expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
-        .to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
+        .to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
     end
     it 'provides descriptive error messages for invalid MIME types' do
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
       expect(validation_error).to be_a(ArgumentError)
-      # Runtime error (file not found)
+      # Runtime error (file not found) - IOError since the file doesn't exist
       runtime_error = nil
       begin
         Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
         runtime_error = e
       end
-      expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
+      expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
     end
     it 'provides error recovery suggestions in messages' do

data/spec/binding/tables_spec.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 # frozen_string_literal: true
 require 'spec_helper'
+require 'tempfile'
+require 'fileutils'
 RSpec.describe 'Table Extraction Quality' do
   describe 'table structure extraction' do
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
     it 'handles documents with no tables gracefully' do
       config = Kreuzberg::Config::Extraction.new
+      # Create a temporary text file for this test
+      file = Tempfile.new(['no_tables_test', '.txt'])
+      file.write('This is a text document without any tables.')
+      file.close
       begin
-        result = Kreuzberg.extract_file(path: 'test.txt', config: config)
+        result = Kreuzberg.extract_file(path: file.path, config: config)
         expect(result).not_to be_nil
         expect(result.tables).to be_a(Array) if result.tables
-      rescue Kreuzberg::Errors::ValidationError
+      rescue Kreuzberg::Errors::IOError
         skip 'Text file not available for testing'
+      ensure
+        FileUtils.rm_f(file.path)
       end
     end

data/spec/unit/config/output_format_spec.rb CHANGED Viewed

@@ -282,34 +282,34 @@ RSpec.describe 'Output Format and Result Format Configuration' do
     end
     describe 'format validation and edge cases' do
-      it 'handles empty string output_format' do
-        config = described_class.new(output_format: '')
-        expect(config.output_format).to eq ''
+      it 'raises error for empty string output_format' do
+        expect do
+          described_class.new(output_format: '')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
       end
-      it 'handles empty string result_format' do
-        config = described_class.new(result_format: '')
-        expect(config.result_format).to eq ''
+      it 'raises error for empty string result_format' do
+        expect do
+          described_class.new(result_format: '')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
       end
-      it 'handles whitespace in output_format' do
-        config = described_class.new(output_format: '  plain  ')
-        expect(config.output_format).to eq '  plain  '
+      it 'raises error for whitespace in output_format' do
+        expect do
+          described_class.new(output_format: '  plain  ')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
       end
-      it 'handles case sensitivity in output_format' do
+      it 'normalizes case in output_format' do
         config = described_class.new(output_format: 'MarkDown')
-        expect(config.output_format).to eq 'MarkDown'
+        expect(config.output_format).to eq 'markdown'
       end
-      it 'handles custom string in result_format' do
-        config = described_class.new(result_format: 'custom_format')
-        expect(config.result_format).to eq 'custom_format'
+      it 'raises error for custom string in result_format' do
+        expect do
+          described_class.new(result_format: 'custom_format')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
       end
     end

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.0"
+version = "4.2.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.0"
+version = "4.2.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.0 Release**
+> **🚀 Version 4.2.1 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/api/startup.rs CHANGED Viewed

@@ -2,7 +2,7 @@
 use std::net::{IpAddr, SocketAddr};
-use crate::{ExtractionConfig, Result, core::ServerConfig};
+use crate::{ExtractionConfig, Result, core::ServerConfig, plugins::startup_validation::validate_plugins_at_startup};
 use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
@@ -80,6 +80,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
         server_config.max_multipart_field_bytes,
     );
+    // Validate plugins at startup
+    validate_plugins_at_startup()?;
     serve_with_config_and_limits(host, port, extraction_config, limits).await
 }
@@ -111,6 +114,10 @@ pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: Extract
         "Upload size limit: 100 MB (default, {} bytes)",
         limits.max_request_body_bytes
     );
+    // Validate plugins at startup
+    validate_plugins_at_startup()?;
     serve_with_config_and_limits(host, port, config, limits).await
 }
@@ -158,6 +165,9 @@ pub async fn serve_with_config_and_limits(
     let addr = SocketAddr::new(ip, port);
     let app = create_router_with_limits_and_server_config(config, limits, server_config);
+    // Validate plugins at startup
+    validate_plugins_at_startup()?;
     tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
     let listener = tokio::net::TcpListener::bind(addr)
@@ -214,6 +224,9 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
     let addr = SocketAddr::new(ip, server_config.port);
     let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
+    // Validate plugins at startup
+    validate_plugins_at_startup()?;
     tracing::info!(
         "Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
         ip,
@@ -238,6 +251,7 @@ pub async fn serve_with_server_config(extraction_config: ExtractionConfig, serve
 /// Defaults: host = "127.0.0.1", port = 8000
 ///
 /// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
+/// Validates plugins at startup to help diagnose configuration issues.
 pub async fn serve_default() -> Result<()> {
     serve("127.0.0.1", 8000).await
 }

data/vendor/kreuzberg/src/core/config_validation/sections.rs CHANGED Viewed

@@ -30,8 +30,10 @@ const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 /// Valid tesseract OEM (OCR Engine Mode) values.
 const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
-/// Valid output formats for tesseract.
-const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
+/// Valid output formats for document extraction.
+/// Supports plain text, markdown, djot, and HTML output formats.
+/// Also accepts aliases: "text" for "plain", "md" for "markdown".
+const VALID_OUTPUT_FORMATS: &[&str] = &["plain", "text", "markdown", "md", "djot", "html"];
 /// Validate a binarization method string.
 ///
@@ -248,11 +250,17 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
     }
 }
-/// Validate a tesseract output format.
+/// Validate a document extraction output format.
+///
+/// Accepts the following formats and aliases:
+/// - "plain" or "text" for plain text output
+/// - "markdown" or "md" for Markdown output
+/// - "djot" for Djot markup format
+/// - "html" for HTML output
 ///
 /// # Arguments
 ///
-/// * `format` - The output format to validate (e.g., "text", "markdown")
+/// * `format` - The output format to validate
 ///
 /// # Returns
 ///
@@ -264,7 +272,11 @@ pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
 /// use kreuzberg::core::config_validation::validate_output_format;
 ///
 /// assert!(validate_output_format("text").is_ok());
+/// assert!(validate_output_format("plain").is_ok());
 /// assert!(validate_output_format("markdown").is_ok());
+/// assert!(validate_output_format("md").is_ok());
+/// assert!(validate_output_format("djot").is_ok());
+/// assert!(validate_output_format("html").is_ok());
 /// assert!(validate_output_format("json").is_err());
 /// ```
 pub fn validate_output_format(format: &str) -> Result<()> {

data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED Viewed

@@ -106,9 +106,8 @@ pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
+/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
-/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
 ///
 /// # Example
 ///

data/vendor/kreuzberg/src/core/extractor/mod.rs CHANGED Viewed

@@ -411,7 +411,8 @@ mod tests {
         assert!(result.is_err());
         use crate::KreuzbergError;
-        assert!(matches!(result.unwrap_err(), KreuzbergError::Validation { .. }));
+        // File validation returns Io error, not Validation error
+        assert!(matches!(result.unwrap_err(), KreuzbergError::Io { .. }));
     }
     #[test]

data/vendor/kreuzberg/src/core/io.rs CHANGED Viewed

@@ -61,12 +61,12 @@ pub fn file_exists(path: impl AsRef<Path>) -> bool {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::Validation` if file doesn't exist.
+/// Returns `KreuzbergError::Io` if file doesn't exist.
 pub fn validate_file_exists(path: impl AsRef<Path>) -> Result<()> {
     if !file_exists(&path) {
-        return Err(KreuzbergError::validation(format!(
-            "File does not exist: {}",
-            path.as_ref().display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.as_ref().display()),
         )));
     }
     Ok(())
@@ -99,9 +99,9 @@ where
     let mut files = Vec::new();
     if !dir.is_dir() {
-        return Err(KreuzbergError::validation(format!(
-            "Path is not a directory: {}",
-            dir.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotADirectory,
+            format!("Path is not a directory: {}", dir.display()),
         )));
     }

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -231,15 +231,15 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
 ///
 /// # Errors
 ///
-/// Returns `KreuzbergError::Validation` if file doesn't exist (when `check_exists` is true).
+/// Returns `KreuzbergError::Io` if file doesn't exist (when `check_exists` is true).
 /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
 pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String> {
     let path = path.as_ref();
     if check_exists && !path.exists() {
-        return Err(KreuzbergError::validation(format!(
-            "File does not exist: {}",
-            path.display()
+        return Err(KreuzbergError::from(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("File does not exist: {}", path.display()),
         )));
     }

data/vendor/kreuzberg/src/extraction/pptx/parser.rs CHANGED Viewed

@@ -384,5 +384,11 @@ pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
         }
     }
+    // Sort slide paths to ensure correct ordering regardless of XML order.
+    // PowerPoint doesn't guarantee relationship order in the rels file.
+    // GitHub Issue #329: Without sorting, slides can be processed in wrong order,
+    // causing images to have incorrect page numbers.
+    slide_paths.sort();
     Ok(slide_paths)
 }

data/vendor/kreuzberg/src/plugins/mod.rs CHANGED Viewed

@@ -206,6 +206,7 @@ mod extractor;
 mod ocr;
 mod processor;
 pub mod registry;
+pub mod startup_validation;
 mod traits;
 mod validator;