RubyGems - kreuzberg - Versions diffs - 4.2.0 → 4.2.2 - Mend

kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +59 -28
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +23 -11
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/config_spec.rb +1 -1
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/tables_spec.rb +11 -2
data/spec/unit/config/extraction_config_spec.rb +2 -2
data/spec/unit/config/output_format_spec.rb +18 -18
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +60 -0
data/vendor/kreuzberg/src/api/handlers.rs +153 -32
data/vendor/kreuzberg/src/api/mod.rs +2 -0
data/vendor/kreuzberg/src/api/openapi.rs +141 -0
data/vendor/kreuzberg/src/api/router.rs +24 -2
data/vendor/kreuzberg/src/api/startup.rs +21 -1
data/vendor/kreuzberg/src/api/types.rs +50 -4
data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
data/vendor/kreuzberg/tests/core_integration.rs +2 -4
data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
data/vendor/kreuzberg-ffi/src/types.rs +8 -5
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +5 -2

data/lib/kreuzberg/extraction_api.rb CHANGED Viewed

@@ -15,11 +15,15 @@ module Kreuzberg
     # @example Extract with explicit MIME type
     # @example Extract with OCR enabled
     def extract_file_sync(path:, mime_type: nil, config: nil)
+      # Validate that the file exists
+      path_str = path.to_s
+      raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
       opts = normalize_config(config)
       hash = if mime_type
-               native_extract_file_sync(path.to_s, mime_type.to_s, **opts)
+               native_extract_file_sync(path_str, mime_type.to_s, **opts)
              else
-               native_extract_file_sync(path.to_s, **opts)
+               native_extract_file_sync(path_str, **opts)
              end
       result = Result.new(hash)
       record_cache_entry!(result, opts)
@@ -53,6 +57,8 @@ module Kreuzberg
     #   response = HTTParty.get("https://example.com/document.docx")
     #   result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
     def extract_bytes_sync(data:, mime_type:, config: nil)
+      raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
       opts = normalize_config(config)
       hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
       result = Result.new(hash)
@@ -92,6 +98,12 @@ module Kreuzberg
     #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
     #   results = Kreuzberg.batch_extract_files_sync(paths, config: config)
     def batch_extract_files_sync(paths:, config: nil)
+      # Validate that all files exist
+      paths.each do |path|
+        path_str = path.to_s
+        raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
+      end
       opts = normalize_config(config)
       hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
       results = hashes.map { |hash| Result.new(hash) }
@@ -130,11 +142,15 @@ module Kreuzberg
     #   )
     #   result = Kreuzberg.extract_file("document.pdf", config: config)
     def extract_file(path:, mime_type: nil, config: nil)
+      # Validate that the file exists
+      path_str = path.to_s
+      raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
       opts = normalize_config(config)
       hash = if mime_type
-               native_extract_file(path.to_s, mime_type.to_s, **opts)
+               native_extract_file(path_str, mime_type.to_s, **opts)
              else
-               native_extract_file(path.to_s, **opts)
+               native_extract_file(path_str, **opts)
              end
       result = Result.new(hash)
       record_cache_entry!(result, opts)

data/lib/kreuzberg/result.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Kreuzberg
   # rubocop:disable Metrics/ClassLength
   class Result
     attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
-                :detected_languages, :chunks, :images, :pages, :elements
+                :detected_languages, :chunks, :images, :pages, :elements, :djot_content
     # @!attribute [r] cells
     #   @return [Array<Array<String>>] Table cells (2D array)
@@ -180,6 +180,7 @@ module Kreuzberg
     #
     # @param hash [Hash] Hash returned from native extension
     #
+    # rubocop:disable Metrics/AbcSize
     def initialize(hash)
       @content = get_value(hash, 'content', '')
       @mime_type = get_value(hash, 'mime_type', '')
@@ -191,7 +192,9 @@ module Kreuzberg
       @images = parse_images(get_value(hash, 'images'))
       @pages = parse_pages(get_value(hash, 'pages'))
       @elements = parse_elements(get_value(hash, 'elements'))
+      @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
     end
+    # rubocop:enable Metrics/AbcSize
     # Convert to hash
     #
@@ -207,7 +210,8 @@ module Kreuzberg
         chunks: serialize_chunks,
         images: serialize_images,
         pages: serialize_pages,
-        elements: serialize_elements
+        elements: serialize_elements,
+        djot_content: @djot_content&.to_h
       }
     end
@@ -434,6 +438,12 @@ module Kreuzberg
         y1: coordinates_data['y1'].to_f
       )
     end
+    def parse_djot_content(djot_data)
+      return nil if djot_data.nil?
+      DjotContent.new(djot_data)
+    end
   end
   # rubocop:enable Metrics/ClassLength
 end

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.0'
+  VERSION = '4.2.2'
 end

data/lib/kreuzberg.rb CHANGED Viewed

@@ -87,6 +87,7 @@ end
 require_relative 'kreuzberg/cache_api'
 require_relative 'kreuzberg/extraction_api'
+require_relative 'kreuzberg/djot_content'
 Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
 Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -417,14 +417,23 @@ module Kreuzberg
       attr_reader plain_text: String
       attr_reader blocks: Array[DjotContent::FormattedBlock]
       attr_reader metadata: Hash[untyped, untyped]
-      attr_reader tables: Array[Table]
+      attr_reader metadata_json: String
+      attr_reader tables: Array[untyped]
       attr_reader images: Array[DjotContent::DjotImage]
       attr_reader links: Array[DjotContent::DjotLink]
       attr_reader footnotes: Array[DjotContent::Footnote]
       attr_reader attributes: Hash[String, untyped]?
-      def initialize: (djot_content_hash hash) -> void
-      def to_h: () -> djot_content_hash
+      def initialize: (untyped hash) -> void
+      def to_h: () -> Hash[Symbol, untyped]
+      private
+      def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
+      def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
+      def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
+      def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
+      def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
       class FormattedBlock
         attr_reader block_type: String
@@ -433,28 +442,31 @@ module Kreuzberg
         attr_reader children: Array[FormattedBlock]?
         attr_reader attributes: Hash[String, untyped]?
-        def initialize: (formatted_block_hash hash) -> void
-        def to_h: () -> formatted_block_hash
+        def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotImage
         attr_reader url: String
         attr_reader alt: String?
         attr_reader title: String?
-        attr_reader attributes: Hash[String, untyped]?
+        attr_reader width: Integer?
+        attr_reader height: Integer?
-        def initialize: (djot_image_hash hash) -> void
-        def to_h: () -> djot_image_hash
+        def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
+        def src: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotLink
         attr_reader url: String
-        attr_reader text: String
+        attr_reader text: String?
         attr_reader title: String?
         attr_reader link_type: String?
-        def initialize: (djot_link_hash hash) -> void
-        def to_h: () -> djot_link_hash
+        def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
+        def href: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class Footnote

data/spec/binding/batch_spec.rb CHANGED Viewed

@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
   end
   describe 'batch error handling' do
-    it 'handles missing files gracefully in batch' do
+    it 'raises IOError for missing files in batch' do
       paths = [
         '/nonexistent/file1.txt',
         '/nonexistent/file2.txt'
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
       expect do
         described_class.batch_extract_files_sync(paths: paths)
-      end.not_to raise_error
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     end
-    it 'handles mixed valid and invalid paths' do
+    it 'raises IOError when batch contains invalid paths' do
       paths = []
       temp_dir = Dir.mktmpdir
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
       paths << '/nonexistent/invalid.txt'
-      results = described_class.batch_extract_files_sync(paths: paths)
-      expect(results).to be_a(Array)
+      expect do
+        described_class.batch_extract_files_sync(paths: paths)
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     ensure
       FileUtils.remove_entry(temp_dir)
     end

data/spec/binding/config_spec.rb CHANGED Viewed

@@ -309,7 +309,7 @@ RSpec.describe Kreuzberg::Config do
       config = described_class.new
       expect(config.use_cache).to be true
-      expect(config.enable_quality_processing).to be false
+      expect(config.enable_quality_processing).to be true
       expect(config.force_ocr).to be false
       expect(config.ocr).to be_nil
       expect(config.chunking).to be_nil

data/spec/binding/error_recovery_spec.rb CHANGED Viewed

@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
       nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
       expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
-        .to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
+        .to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
     end
     it 'provides descriptive error messages for invalid MIME types' do
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
       expect(validation_error).to be_a(ArgumentError)
-      # Runtime error (file not found)
+      # Runtime error (file not found) - IOError since the file doesn't exist
       runtime_error = nil
       begin
         Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
         runtime_error = e
       end
-      expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
+      expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
     end
     it 'provides error recovery suggestions in messages' do

data/spec/binding/tables_spec.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 # frozen_string_literal: true
 require 'spec_helper'
+require 'tempfile'
+require 'fileutils'
 RSpec.describe 'Table Extraction Quality' do
   describe 'table structure extraction' do
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
     it 'handles documents with no tables gracefully' do
       config = Kreuzberg::Config::Extraction.new
+      # Create a temporary text file for this test
+      file = Tempfile.new(['no_tables_test', '.txt'])
+      file.write('This is a text document without any tables.')
+      file.close
       begin
-        result = Kreuzberg.extract_file(path: 'test.txt', config: config)
+        result = Kreuzberg.extract_file(path: file.path, config: config)
         expect(result).not_to be_nil
         expect(result.tables).to be_a(Array) if result.tables
-      rescue Kreuzberg::Errors::ValidationError
+      rescue Kreuzberg::Errors::IOError
         skip 'Text file not available for testing'
+      ensure
+        FileUtils.rm_f(file.path)
       end
     end

data/spec/unit/config/extraction_config_spec.rb CHANGED Viewed

@@ -6,7 +6,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
       config = described_class.new
       expect(config.use_cache).to be true
-      expect(config.enable_quality_processing).to be false
+      expect(config.enable_quality_processing).to be true
       expect(config.force_ocr).to be false
       expect(config.ocr).to be_nil
       expect(config.chunking).to be_nil
@@ -103,7 +103,7 @@ RSpec.describe Kreuzberg::Config::Extraction do
       hash = config.to_h
       expect(hash[:use_cache]).to be true
-      expect(hash[:enable_quality_processing]).to be false
+      expect(hash[:enable_quality_processing]).to be true
       expect(hash[:force_ocr]).to be false
     end
   end

data/spec/unit/config/output_format_spec.rb CHANGED Viewed

@@ -282,34 +282,34 @@ RSpec.describe 'Output Format and Result Format Configuration' do
     end
     describe 'format validation and edge cases' do
-      it 'handles empty string output_format' do
-        config = described_class.new(output_format: '')
-        expect(config.output_format).to eq ''
+      it 'raises error for empty string output_format' do
+        expect do
+          described_class.new(output_format: '')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
       end
-      it 'handles empty string result_format' do
-        config = described_class.new(result_format: '')
-        expect(config.result_format).to eq ''
+      it 'raises error for empty string result_format' do
+        expect do
+          described_class.new(result_format: '')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
       end
-      it 'handles whitespace in output_format' do
-        config = described_class.new(output_format: '  plain  ')
-        expect(config.output_format).to eq '  plain  '
+      it 'raises error for whitespace in output_format' do
+        expect do
+          described_class.new(output_format: '  plain  ')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
       end
-      it 'handles case sensitivity in output_format' do
+      it 'normalizes case in output_format' do
         config = described_class.new(output_format: 'MarkDown')
-        expect(config.output_format).to eq 'MarkDown'
+        expect(config.output_format).to eq 'markdown'
       end
-      it 'handles custom string in result_format' do
-        config = described_class.new(result_format: 'custom_format')
-        expect(config.result_format).to eq 'custom_format'
+      it 'raises error for custom string in result_format' do
+        expect do
+          described_class.new(result_format: 'custom_format')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
       end
     end

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.0"
+version = "4.2.2"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.0"
+version = "4.2.2"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -71,7 +71,7 @@ keywords-yake = ["dep:yake-rust", "stopwords"]
 keywords-rake = ["dep:rake", "stopwords"]
 keywords = ["keywords-yake", "keywords-rake"]
-api = ["dep:axum", "dep:tower", "dep:tower-http", "tokio-runtime"]
+api = ["dep:axum", "dep:tower", "dep:tower-http", "dep:utoipa", "tokio-runtime"]
 mcp = ["dep:rmcp", "tokio-runtime"]
 mcp-http = ["mcp", "api"]
@@ -198,6 +198,7 @@ rake = { version = "0.3.6", optional = true }
 axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
 tower = { version = "0.5", optional = true }
 tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
+utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
 rmcp = { version = "0.14.0", features = [
     "server",
     "macros",

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.0 Release**
+> **🚀 Version 4.2.2 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/api/error.rs CHANGED Viewed

@@ -2,14 +2,38 @@
 use axum::{
     Json,
+    extract::{FromRequest, Request, rejection::JsonRejection},
     http::StatusCode,
     response::{IntoResponse, Response},
 };
+use serde::de::DeserializeOwned;
 use crate::error::KreuzbergError;
 use super::types::ErrorResponse;
+/// Custom JSON extractor that returns JSON error responses instead of plain text.
+///
+/// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
+/// ensuring that all JSON parsing errors are returned as JSON with proper content type.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct JsonApi<T>(pub T);
+impl<T, S> FromRequest<S> for JsonApi<T>
+where
+    T: DeserializeOwned,
+    S: Send + Sync,
+{
+    type Rejection = ApiError;
+    async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
+        match Json::<T>::from_request(req, state).await {
+            Ok(Json(value)) => Ok(JsonApi(value)),
+            Err(rejection) => Err(ApiError::from(rejection)),
+        }
+    }
+}
 /// API-specific error wrapper.
 #[derive(Debug)]
 pub struct ApiError {
@@ -79,3 +103,39 @@ impl From<KreuzbergError> for ApiError {
         }
     }
 }
+impl From<JsonRejection> for ApiError {
+    fn from(rejection: JsonRejection) -> Self {
+        let (status, message) = match rejection {
+            JsonRejection::JsonDataError(err) => (
+                StatusCode::UNPROCESSABLE_ENTITY,
+                format!(
+                    "Failed to deserialize the JSON body into the target type: {}",
+                    err.body_text()
+                ),
+            ),
+            JsonRejection::JsonSyntaxError(err) => (
+                StatusCode::BAD_REQUEST,
+                format!("Failed to parse the request body as JSON: {}", err.body_text()),
+            ),
+            JsonRejection::MissingJsonContentType(_) => (
+                StatusCode::UNSUPPORTED_MEDIA_TYPE,
+                "Expected request with `Content-Type: application/json`".to_string(),
+            ),
+            JsonRejection::BytesRejection(err) => {
+                (StatusCode::BAD_REQUEST, format!("Failed to read request body: {}", err))
+            }
+            _ => (StatusCode::BAD_REQUEST, "Unknown JSON parsing error".to_string()),
+        };
+        Self {
+            status,
+            body: ErrorResponse {
+                error_type: "JsonParsingError".to_string(),
+                message,
+                traceback: None,
+                status_code: status.as_u16(),
+            },
+        }
+    }
+}