RubyGems - kreuzberg - Versions diffs - 4.1.2 → 4.2.1 - Mend

kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103)

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/kreuzberg.gemspec +13 -1
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +121 -39
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +28 -12
data/spec/binding/batch_operations_spec.rb +80 -0
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/metadata_types_spec.rb +77 -57
data/spec/binding/tables_spec.rb +11 -2
data/spec/serialization_spec.rb +134 -0
data/spec/unit/config/output_format_spec.rb +380 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/startup.rs +15 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/embeddings.rs +4 -4
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/mcp/format.rs +237 -39
data/vendor/kreuzberg/src/mcp/params.rs +26 -33
data/vendor/kreuzberg/src/mcp/server.rs +6 -3
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
data/vendor/kreuzberg/tests/api_embed.rs +84 -50
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
data/vendor/kreuzberg/tests/api_tests.rs +298 -139
data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
data/vendor/kreuzberg/tests/config_features.rs +19 -15
data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
data/vendor/kreuzberg/tests/core_integration.rs +57 -57
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
data/vendor/kreuzberg/tests/email_integration.rs +7 -7
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/error_handling.rs +13 -11
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/page_markers.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
data/vendor/kreuzberg/tests/security_validation.rs +20 -19
data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +12 -2

data/spec/binding/metadata_types_spec.rb CHANGED Viewed

@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
   describe 'Thread Safety: Concurrent Extraction' do
     it 'handles concurrent extraction safely' do
-      test_files = []
-      results = []
-      errors = []
+      test_files = create_concurrent_test_files
+      results, errors = run_concurrent_extractions(test_files)
-      5.times do |i|
-        html_content = <<~HTML
-          <html>
-          <head>
-            <title>Concurrent Test #{i}</title>
-            <meta name="description" content="Test document #{i}">
-            <meta name="keywords" content="test#{i}, concurrent, thread-safe">
-          </head>
-          <body>
-            <h1>Test Document #{i}</h1>
-            <p>Content for test #{i}</p>
-            <a href="/page-#{i}">Link #{i}</a>
-            <img src="image-#{i}.jpg" alt="Image #{i}">
-          </body>
-          </html>
-        HTML
-        test_files << create_test_html_file(html_content)
-      end
-      begin
-        threads = test_files.map do |file|
-          Thread.new do
-            result = Kreuzberg.extract_file_sync(path: file)
-            results << result
-          rescue StandardError => e
-            errors << e
-          end
-        end
-        threads.each(&:join)
-        expect(errors).to be_empty
-        expect(results.length).to eq(5)
-        results.each do |result|
-          expect(result).to be_a(Kreuzberg::Result)
-          expect(result.metadata).not_to be_nil
-          metadata = result.metadata
-          next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
-          expect(metadata.title).not_to be_nil
-          expect(metadata.description).not_to be_nil
-          expect(metadata.keywords).to be_a(Array)
-          expect(metadata.headers).to be_a(Array)
-          expect(metadata.links).to be_a(Array)
-          expect(metadata.images).to be_a(Array)
-        end
-        titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
-        expect(titles.uniq.length).to eq(5)
-      ensure
-        test_files.each { |f| FileUtils.rm_f(f) }
-      end
+      expect(results).not_to be_empty
+      verify_concurrent_results(results, errors, test_files)
     end
   end
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
     file.close
     file.path
   end
+  def create_concurrent_test_files
+    test_files = []
+    5.times do |i|
+      html_content = <<~HTML
+        <html>
+        <head>
+          <title>Concurrent Test #{i}</title>
+          <meta name="description" content="Test document #{i}">
+          <meta name="keywords" content="test#{i}, concurrent, thread-safe">
+        </head>
+        <body>
+          <h1>Test Document #{i}</h1>
+          <p>Content for test #{i}</p>
+          <a href="/page-#{i}">Link #{i}</a>
+          <img src="image-#{i}.jpg" alt="Image #{i}">
+        </body>
+        </html>
+      HTML
+      test_files << create_test_html_file(html_content)
+    end
+    test_files
+  end
+  def run_concurrent_extractions(test_files)
+    results = []
+    errors = []
+    threads = test_files.map do |file|
+      Thread.new do
+        result = Kreuzberg.extract_file_sync(path: file)
+        results << result
+      rescue StandardError => e
+        errors << e
+      end
+    end
+    threads.each(&:join)
+    [results, errors]
+  end
+  def verify_concurrent_results(results, errors, test_files)
+    expect(errors).to be_empty
+    expect(results.length).to eq(5)
+    results.each do |result|
+      expect(result).to be_a(Kreuzberg::Result)
+      expect(result.metadata).not_to be_nil
+      metadata = result.metadata
+      next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
+      verify_metadata_fields(metadata)
+    end
+    titles = extract_titles(results)
+    expect(titles.uniq.length).to eq(5)
+  ensure
+    test_files.each { |f| FileUtils.rm_f(f) }
+  end
+  def verify_metadata_fields(metadata)
+    expect(metadata.title).not_to be_nil
+    expect(metadata.description).not_to be_nil
+    expect(metadata.keywords).to be_a(Array)
+    expect(metadata.headers).to be_a(Array)
+    expect(metadata.links).to be_a(Array)
+    expect(metadata.images).to be_a(Array)
+  end
+  def extract_titles(results)
+    results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
+  end
 end

data/spec/binding/tables_spec.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 # frozen_string_literal: true
 require 'spec_helper'
+require 'tempfile'
+require 'fileutils'
 RSpec.describe 'Table Extraction Quality' do
   describe 'table structure extraction' do
@@ -523,12 +525,19 @@ RSpec.describe 'Table Extraction Quality' do
     it 'handles documents with no tables gracefully' do
       config = Kreuzberg::Config::Extraction.new
+      # Create a temporary text file for this test
+      file = Tempfile.new(['no_tables_test', '.txt'])
+      file.write('This is a text document without any tables.')
+      file.close
       begin
-        result = Kreuzberg.extract_file(path: 'test.txt', config: config)
+        result = Kreuzberg.extract_file(path: file.path, config: config)
         expect(result).not_to be_nil
         expect(result.tables).to be_a(Array) if result.tables
-      rescue Kreuzberg::Errors::ValidationError
+      rescue Kreuzberg::Errors::IOError
         skip 'Text file not available for testing'
+      ensure
+        FileUtils.rm_f(file.path)
       end
     end

data/spec/serialization_spec.rb ADDED Viewed

@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+# Cross-language serialization tests for Ruby bindings
+#
+# Validates that ExtractionConfig serializes consistently with other language bindings
+require 'json'
+require 'spec_helper'
+RSpec.describe Kreuzberg::ExtractionConfig do
+  describe '#to_h' do
+    it 'serializes minimal config to hash' do
+      config = described_class.new
+      hash = config.to_h
+      expect(hash).to be_a(Hash)
+      expect(hash).to have_key(:use_cache)
+      expect(hash).to have_key(:enable_quality_processing)
+      expect(hash).to have_key(:force_ocr)
+    end
+    it 'serializes config with all fields' do
+      config = described_class.new(
+        use_cache: true,
+        enable_quality_processing: true,
+        force_ocr: false
+      )
+      hash = config.to_h
+      expect(hash[:use_cache]).to be(true)
+      expect(hash[:enable_quality_processing]).to be(true)
+      expect(hash[:force_ocr]).to be(false)
+    end
+    it 'preserves field values after serialization' do
+      original = described_class.new(
+        use_cache: false,
+        enable_quality_processing: true
+      )
+      hash = original.to_h
+      expect(hash[:use_cache]).to be(false)
+      expect(hash[:enable_quality_processing]).to be(true)
+    end
+  end
+  describe '#to_json' do
+    it 'serializes to JSON' do
+      config = described_class.new(use_cache: true)
+      json = config.to_json
+      expect(json).to be_a(String)
+      parsed = JSON.parse(json, symbolize_names: true)
+      expect(parsed).to have_key(:use_cache)
+      expect(parsed[:use_cache]).to be(true)
+    end
+    it 'produces valid JSON' do
+      config = described_class.new
+      json = config.to_json
+      expect { JSON.parse(json) }.not_to raise_error
+    end
+    it 'uses snake_case field names' do
+      config = described_class.new(use_cache: true)
+      json = config.to_json
+      expect(json).to include('use_cache')
+      expect(json).not_to include('useCache')
+    end
+  end
+  describe 'round-trip serialization' do
+    it 'survives serialization -> deserialization -> serialization' do
+      config1 = described_class.new(
+        use_cache: true,
+        enable_quality_processing: false
+      )
+      json1 = config1.to_json
+      hash1 = JSON.parse(json1, symbolize_names: true)
+      config2 = described_class.new(hash1)
+      json2 = config2.to_json
+      # JSON strings should be equivalent
+      expect(JSON.parse(json1)).to eq(JSON.parse(json2))
+    end
+  end
+  describe 'field consistency' do
+    it 'includes all mandatory fields' do
+      config = described_class.new
+      hash = config.to_h
+      mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
+      mandatory_fields.each do |field|
+        expect(hash).to have_key(field)
+      end
+    end
+    it 'handles nested ocr config' do
+      config = described_class.new(
+        ocr: {
+          backend: 'tesseract',
+          language: 'eng'
+        }
+      )
+      hash = config.to_h
+      expect(hash).to have_key(:ocr)
+      expect(hash[:ocr][:backend]).to eq('tesseract')
+      expect(hash[:ocr][:language]).to eq('eng')
+    end
+  end
+  describe 'immutability' do
+    it 'does not modify original config during serialization' do
+      config = described_class.new(use_cache: true)
+      json1 = config.to_json
+      json2 = config.to_json
+      json3 = config.to_json
+      expect(json1).to eq(json2)
+      expect(json2).to eq(json3)
+    end
+  end
+end

data/spec/unit/config/output_format_spec.rb ADDED Viewed

@@ -0,0 +1,380 @@
+# frozen_string_literal: true
+# rubocop:disable RSpec/RepeatedExample
+RSpec.describe 'Output Format and Result Format Configuration' do
+  describe Kreuzberg::Config::Extraction do
+    describe 'output_format' do
+      it 'accepts output_format as initialization parameter' do
+        config = described_class.new(output_format: 'markdown')
+        expect(config.output_format).to eq 'markdown'
+      end
+      it 'defaults to nil when not specified' do
+        config = described_class.new
+        expect(config.output_format).to be_nil
+      end
+      it 'accepts plain format' do
+        config = described_class.new(output_format: 'plain')
+        expect(config.output_format).to eq 'plain'
+      end
+      it 'accepts markdown format' do
+        config = described_class.new(output_format: 'markdown')
+        expect(config.output_format).to eq 'markdown'
+      end
+      it 'accepts djot format' do
+        config = described_class.new(output_format: 'djot')
+        expect(config.output_format).to eq 'djot'
+      end
+      it 'accepts html format' do
+        config = described_class.new(output_format: 'html')
+        expect(config.output_format).to eq 'html'
+      end
+      it 'converts output_format to string' do
+        config = described_class.new(output_format: :markdown)
+        expect(config.output_format).to eq 'markdown'
+        expect(config.output_format).to be_a String
+      end
+      it 'includes output_format in to_h' do
+        config = described_class.new(output_format: 'markdown')
+        hash = config.to_h
+        expect(hash[:output_format]).to eq 'markdown'
+      end
+      it 'excludes nil output_format from to_h' do
+        config = described_class.new(output_format: nil)
+        hash = config.to_h
+        expect(hash.key?(:output_format)).to be false
+      end
+      it 'includes output_format in JSON' do
+        config = described_class.new(output_format: 'markdown')
+        json = config.to_json
+        parsed = JSON.parse(json)
+        expect(parsed['output_format']).to eq 'markdown'
+      end
+      it 'retrieves output_format with get_field' do
+        config = described_class.new(output_format: 'djot')
+        expect(config.get_field('output_format')).to eq 'djot'
+      end
+      it 'can be set with []=' do
+        config = described_class.new
+        config[:output_format] = 'html'
+        expect(config.output_format).to eq 'html'
+      end
+      it 'can be set with []= using symbol' do
+        config = described_class.new
+        config[:output_format] = :plain
+        expect(config.output_format).to eq 'plain'
+      end
+      it 'can be retrieved with []' do
+        config = described_class.new(output_format: 'markdown')
+        expect(config[:output_format]).to eq 'markdown'
+      end
+    end
+    describe 'result_format' do
+      it 'accepts result_format as initialization parameter' do
+        config = described_class.new(result_format: 'unified')
+        expect(config.result_format).to eq 'unified'
+      end
+      it 'defaults to nil when not specified' do
+        config = described_class.new
+        expect(config.result_format).to be_nil
+      end
+      it 'accepts unified format' do
+        config = described_class.new(result_format: 'unified')
+        expect(config.result_format).to eq 'unified'
+      end
+      it 'accepts element_based format' do
+        config = described_class.new(result_format: 'element_based')
+        expect(config.result_format).to eq 'element_based'
+      end
+      it 'converts result_format to string' do
+        config = described_class.new(result_format: :unified)
+        expect(config.result_format).to eq 'unified'
+        expect(config.result_format).to be_a String
+      end
+      it 'includes result_format in to_h' do
+        config = described_class.new(result_format: 'element_based')
+        hash = config.to_h
+        expect(hash[:result_format]).to eq 'element_based'
+      end
+      it 'excludes nil result_format from to_h' do
+        config = described_class.new(result_format: nil)
+        hash = config.to_h
+        expect(hash.key?(:result_format)).to be false
+      end
+      it 'includes result_format in JSON' do
+        config = described_class.new(result_format: 'element_based')
+        json = config.to_json
+        parsed = JSON.parse(json)
+        expect(parsed['result_format']).to eq 'element_based'
+      end
+      it 'retrieves result_format with get_field' do
+        config = described_class.new(result_format: 'unified')
+        expect(config.get_field('result_format')).to eq 'unified'
+      end
+      it 'can be set with []=' do
+        config = described_class.new
+        config[:result_format] = 'unified'
+        expect(config.result_format).to eq 'unified'
+      end
+      it 'can be set with []= using symbol' do
+        config = described_class.new
+        config[:result_format] = :element_based
+        expect(config.result_format).to eq 'element_based'
+      end
+      it 'can be retrieved with []' do
+        config = described_class.new(result_format: 'element_based')
+        expect(config[:result_format]).to eq 'element_based'
+      end
+    end
+    describe 'combined output and result formats' do
+      it 'accepts both output_format and result_format' do
+        config = described_class.new(
+          output_format: 'markdown',
+          result_format: 'unified'
+        )
+        expect(config.output_format).to eq 'markdown'
+        expect(config.result_format).to eq 'unified'
+      end
+      it 'serializes both formats in to_h' do
+        config = described_class.new(
+          output_format: 'djot',
+          result_format: 'element_based'
+        )
+        hash = config.to_h
+        expect(hash[:output_format]).to eq 'djot'
+        expect(hash[:result_format]).to eq 'element_based'
+      end
+      it 'serializes both formats in JSON' do
+        config = described_class.new(
+          output_format: 'html',
+          result_format: 'unified'
+        )
+        json = config.to_json
+        parsed = JSON.parse(json)
+        expect(parsed['output_format']).to eq 'html'
+        expect(parsed['result_format']).to eq 'unified'
+      end
+      it 'merges both formats correctly' do
+        base = described_class.new(
+          output_format: 'markdown',
+          result_format: 'unified'
+        )
+        override = described_class.new(output_format: 'html')
+        merged = base.merge(override)
+        expect(merged.output_format).to eq 'html'
+        expect(merged.result_format).to eq 'unified'
+      end
+      it 'merges both formats with merge!' do
+        config = described_class.new(
+          output_format: 'markdown',
+          result_format: 'unified'
+        )
+        override = described_class.new(
+          output_format: 'djot',
+          result_format: 'element_based'
+        )
+        config.merge!(override)
+        expect(config.output_format).to eq 'djot'
+        expect(config.result_format).to eq 'element_based'
+      end
+      it 'handles merge with hash containing both formats' do
+        config = described_class.new(
+          output_format: 'plain',
+          result_format: 'unified'
+        )
+        merged = config.merge({ output_format: 'markdown' })
+        expect(merged.output_format).to eq 'markdown'
+        expect(merged.result_format).to eq 'unified'
+      end
+    end
+    describe 'format persistence across operations' do
+      it 'persists output_format through multiple conversions' do
+        config = described_class.new(output_format: 'markdown')
+        hash = config.to_h
+        new_config = described_class.new(**hash)
+        expect(new_config.output_format).to eq 'markdown'
+      end
+      it 'persists result_format through multiple conversions' do
+        config = described_class.new(result_format: 'element_based')
+        hash = config.to_h
+        new_config = described_class.new(**hash)
+        expect(new_config.result_format).to eq 'element_based'
+      end
+      it 'round-trips through JSON' do
+        config = described_class.new(
+          output_format: 'djot',
+          result_format: 'unified'
+        )
+        json = config.to_json
+        parsed = JSON.parse(json)
+        new_config = described_class.new(**parsed.transform_keys(&:to_sym))
+        expect(new_config.output_format).to eq 'djot'
+        expect(new_config.result_format).to eq 'unified'
+      end
+    end
+    describe 'format validation and edge cases' do
+      it 'raises error for empty string output_format' do
+        expect do
+          described_class.new(output_format: '')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
+      end
+      it 'raises error for empty string result_format' do
+        expect do
+          described_class.new(result_format: '')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
+      end
+      it 'raises error for whitespace in output_format' do
+        expect do
+          described_class.new(output_format: '  plain  ')
+        end.to raise_error(ArgumentError, /Invalid output_format/)
+      end
+      it 'normalizes case in output_format' do
+        config = described_class.new(output_format: 'MarkDown')
+        expect(config.output_format).to eq 'markdown'
+      end
+      it 'raises error for custom string in result_format' do
+        expect do
+          described_class.new(result_format: 'custom_format')
+        end.to raise_error(ArgumentError, /Invalid result_format/)
+      end
+    end
+    describe 'integration with other config fields' do
+      it 'works with output_format and chunking together' do
+        config = described_class.new(
+          output_format: 'markdown',
+          chunking: { max_chars: 500 }
+        )
+        expect(config.output_format).to eq 'markdown'
+        expect(config.chunking.max_chars).to eq 500
+      end
+      it 'works with result_format and OCR together' do
+        config = described_class.new(
+          result_format: 'element_based',
+          ocr: { backend: 'tesseract' }
+        )
+        expect(config.result_format).to eq 'element_based'
+        expect(config.ocr.backend).to eq 'tesseract'
+      end
+      it 'works with both formats and language detection' do
+        config = described_class.new(
+          output_format: 'html',
+          result_format: 'unified',
+          language_detection: { enabled: true }
+        )
+        expect(config.output_format).to eq 'html'
+        expect(config.result_format).to eq 'unified'
+        expect(config.language_detection.enabled).to be true
+      end
+      it 'preserves formats in complex config merge' do
+        base = described_class.new(
+          output_format: 'markdown',
+          result_format: 'unified',
+          chunking: { max_chars: 500 },
+          ocr: { backend: 'tesseract' }
+        )
+        override = described_class.new(
+          output_format: 'djot',
+          chunking: { max_chars: 750 }
+        )
+        merged = base.merge(override)
+        expect(merged.output_format).to eq 'djot'
+        expect(merged.result_format).to eq 'unified'
+        expect(merged.chunking.max_chars).to eq 750
+        expect(merged.ocr.backend).to eq 'tesseract'
+      end
+    end
+    describe 'allowed keys integration' do
+      it 'includes output_format in ALLOWED_KEYS' do
+        expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:output_format)
+      end
+      it 'includes result_format in ALLOWED_KEYS' do
+        expect(Kreuzberg::Config::Extraction::ALLOWED_KEYS).to include(:result_format)
+      end
+    end
+  end
+end
+# rubocop:enable RSpec/RepeatedExample

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.1.2"
+version = "4.2.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.1.2"
+version = "4.2.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]