RubyGems - kreuzberg - Versions diffs - 4.1.2 → 4.2.1 - Mend

kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/kreuzberg.gemspec +13 -1
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +121 -39
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +28 -12
data/spec/binding/batch_operations_spec.rb +80 -0
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/metadata_types_spec.rb +77 -57
data/spec/binding/tables_spec.rb +11 -2
data/spec/serialization_spec.rb +134 -0
data/spec/unit/config/output_format_spec.rb +380 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/startup.rs +15 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/embeddings.rs +4 -4
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/mcp/format.rs +237 -39
data/vendor/kreuzberg/src/mcp/params.rs +26 -33
data/vendor/kreuzberg/src/mcp/server.rs +6 -3
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
data/vendor/kreuzberg/tests/api_embed.rs +84 -50
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
data/vendor/kreuzberg/tests/api_tests.rs +298 -139
data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
data/vendor/kreuzberg/tests/config_features.rs +19 -15
data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
data/vendor/kreuzberg/tests/core_integration.rs +57 -57
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
data/vendor/kreuzberg/tests/email_integration.rs +7 -7
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/error_handling.rs +13 -11
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/page_markers.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
data/vendor/kreuzberg/tests/security_validation.rs +20 -19
data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +12 -2

data/lib/kreuzberg/djot_content.rb ADDED Viewed

@@ -0,0 +1,225 @@
+# frozen_string_literal: true
+begin
+  require 'json'
+rescue LoadError
+  require 'json/pure'
+end
+module Kreuzberg
+  class Result
+    # Djot structured content representation
+    #
+    # Represents document content in Djot format with structured metadata about
+    # blocks, images, links, footnotes, and other document elements.
+    #
+    class DjotContent
+      attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes
+      # Represents a formatted block in Djot content
+      class FormattedBlock
+        attr_reader :block_type, :children, :attributes, :content, :level
+        # Builds a block from either a positional Hash (symbol or string keys)
+        # or from keyword arguments.
+        #
+        # @param hash_or_type [Hash, String, nil] attribute Hash, or the block type
+        #   used when +block_type:+ is not supplied
+        # @param children [Array, nil] child blocks; defaults to [] on the keyword path
+        # @param attributes [Hash, nil] block attributes; defaults to {}
+        # @param content [Object, nil] stored as given
+        # @param level [Object, nil] stored as given
+        # @param block_type [String, nil] wins over a positional non-Hash value
+        def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
+          case hash_or_type
+          when Hash
+            assign_from_hash(hash_or_type)
+          else
+            @block_type = block_type || hash_or_type || ''
+            @children = children || []
+            @attributes = attributes || {}
+            @content = content
+            @level = level
+          end
+        end
+        # @return [Hash{Symbol => Object}] the block's fields with nil values removed
+        def to_h
+          fields = {
+            block_type: block_type,
+            children: children,
+            attributes: attributes,
+            content: content,
+            level: level
+          }
+          fields.compact
+        end
+        private
+        # Copies fields out of +source+, trying the symbol key before the string key.
+        # Unlike the keyword path, children stay nil when the Hash has none.
+        def assign_from_hash(source)
+          read = ->(key) { source[key] || source[key.to_s] }
+          @block_type = read.call(:block_type) || ''
+          @children = read.call(:children)
+          @attributes = read.call(:attributes) || {}
+          @content = read.call(:content)
+          @level = read.call(:level)
+        end
+      end
+      # Represents an image in Djot content
+      class DjotImage
+        attr_reader :url, :alt, :title, :width, :height
+        alias src url
+        # Builds an image from either a positional Hash (symbol or string keys,
+        # with 'src' accepted as a fallback for 'url') or from keyword arguments.
+        #
+        # @param hash_or_url [Hash, String, nil] attribute Hash, or the URL used
+        #   when neither +url:+ nor +src:+ is supplied
+        # @param alt [Object, nil] stored as given
+        # @param title [Object, nil] stored as given
+        # @param width [Object, nil] stored as given
+        # @param height [Object, nil] stored as given
+        # @param url [String, nil] image location; takes precedence over +src:+
+        # @param src [String, nil] alternative name for +url:+
+        def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
+          case hash_or_url
+          when Hash
+            assign_from_hash(hash_or_url)
+          else
+            @url = url || src || hash_or_url
+            @alt = alt
+            @title = title
+            @width = width
+            @height = height
+          end
+        end
+        # @return [Hash{Symbol => Object}] the image's fields with nil values removed
+        def to_h
+          fields = {
+            url: @url,
+            alt: @alt,
+            title: @title,
+            width: @width,
+            height: @height
+          }
+          fields.compact
+        end
+        private
+        # Copies fields out of +source+, trying the symbol key before the string key.
+        def assign_from_hash(source)
+          read = ->(key) { source[key] || source[key.to_s] }
+          @url = read.call(:url) || read.call(:src)
+          @alt = read.call(:alt)
+          @title = read.call(:title)
+          @width = read.call(:width)
+          @height = read.call(:height)
+        end
+      end
+      # Represents a link in Djot content
+      class DjotLink
+        attr_reader :url, :text, :title, :link_type
+        alias href url
+        # Builds a link from either a positional Hash (symbol or string keys,
+        # with 'href' accepted as a fallback for 'url') or from keyword arguments.
+        #
+        # @param hash_or_url [Hash, String, nil] attribute Hash, or the URL used
+        #   when neither +url:+ nor +href:+ is supplied
+        # @param text [Object, nil] stored as given
+        # @param title [Object, nil] stored as given
+        # @param url [String, nil] link target; takes precedence over +href:+
+        # @param href [String, nil] alternative name for +url:+
+        # @param link_type [Object, nil] stored as given
+        def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
+          case hash_or_url
+          when Hash
+            assign_from_hash(hash_or_url)
+          else
+            @url = url || href || hash_or_url
+            @text = text
+            @title = title
+            @link_type = link_type
+          end
+        end
+        # @return [Hash{Symbol => Object}] the link's fields with nil values removed
+        def to_h
+          fields = {
+            url: @url,
+            text: @text,
+            title: @title,
+            link_type: @link_type
+          }
+          fields.compact
+        end
+        private
+        # Copies fields out of +source+, trying the symbol key before the string key.
+        def assign_from_hash(source)
+          read = ->(key) { source[key] || source[key.to_s] }
+          @url = read.call(:url) || read.call(:href)
+          @text = read.call(:text)
+          @title = read.call(:title)
+          @link_type = read.call(:link_type)
+        end
+      end
+      # Represents a footnote in Djot content
+      class Footnote
+        attr_reader :label, :content
+        # @param label [Object] footnote label, stored as given
+        # @param content [Object] footnote content, stored as given
+        def initialize(label:, content:)
+          @label, @content = label, content
+        end
+        # @return [Hash{Symbol => Object}] both fields, nil values included
+        def to_h
+          { label: label, content: content }
+        end
+      end
+      # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+      # Populates the content from +hash+. Every field is looked up under its
+      # string key first, then its symbol key, then falls back to an empty default.
+      #
+      # @param hash [Hash] djot content data with string or symbol keys
+      def initialize(hash)
+        text = hash['plain_text'] || hash[:plain_text]
+        @plain_text = text || ''
+        raw_blocks = hash['blocks'] || hash[:blocks]
+        @blocks = parse_blocks(raw_blocks || [])
+        raw_metadata = hash['metadata_json'] || hash[:metadata_json]
+        @metadata_json = raw_metadata || '{}'
+        raw_tables = hash['tables'] || hash[:tables]
+        @tables = raw_tables || []
+        raw_images = hash['images'] || hash[:images]
+        @images = parse_images(raw_images || [])
+        raw_links = hash['links'] || hash[:links]
+        @links = parse_links(raw_links || [])
+        raw_footnotes = hash['footnotes'] || hash[:footnotes]
+        @footnotes = parse_footnotes(raw_footnotes || [])
+        raw_attributes = hash['attributes'] || hash[:attributes]
+        @attributes = raw_attributes || {}
+      end
+      # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+      # Decoded form of metadata_json, parsed on first access and reused
+      # afterwards once a truthy value has been stored.
+      def metadata
+        return @metadata if @metadata
+        @metadata = parse_metadata(@metadata_json)
+      end
+      # Serializes the content to a symbol-keyed Hash. Blocks, images, links
+      # and footnotes are converted with their own to_h; plain_text,
+      # metadata_json, tables and attributes are passed through unchanged.
+      def to_h
+        block_hashes = @blocks.map(&:to_h)
+        image_hashes = @images.map(&:to_h)
+        link_hashes = @links.map(&:to_h)
+        footnote_hashes = @footnotes.map(&:to_h)
+        {
+          plain_text: @plain_text,
+          blocks: block_hashes,
+          metadata_json: @metadata_json,
+          tables: @tables,
+          images: image_hashes,
+          links: link_hashes,
+          footnotes: footnote_hashes,
+          attributes: @attributes
+        }
+      end
+      private
+      # Decodes the metadata JSON document into a Hash.
+      #
+      # Always returns a Hash: malformed JSON, non-String input (JSON.parse
+      # raises TypeError for it) and valid JSON whose top-level value is not an
+      # object (null, arrays, scalars) all yield {}.
+      #
+      # @param metadata_json [String] JSON text
+      # @return [Hash] decoded metadata, or {} when it cannot be decoded to a Hash
+      def parse_metadata(metadata_json)
+        parsed = JSON.parse(metadata_json)
+        parsed.is_a?(Hash) ? parsed : {}
+      rescue JSON::ParserError, TypeError
+        {}
+      end
+      # Wraps each raw block Hash (string or symbol keys) in a FormattedBlock.
+      #
+      # content and level are forwarded together with block_type, children and
+      # attributes so that they are not dropped from the parsed blocks or from
+      # the to_h round-trip.
+      #
+      # @param blocks_data [Array<Hash>] raw block hashes
+      # @return [Array<FormattedBlock>]
+      def parse_blocks(blocks_data)
+        blocks_data.map do |block|
+          FormattedBlock.new(
+            block_type: block['block_type'] || block[:block_type] || '',
+            children: block['children'] || block[:children],
+            attributes: block['attributes'] || block[:attributes],
+            content: block['content'] || block[:content],
+            level: block['level'] || block[:level]
+          )
+        end
+      end
+      # rubocop:disable Metrics/CyclomaticComplexity
+      # Wraps each raw image Hash in a DjotImage. Keys are tried as strings
+      # first, then symbols; 'src' is the fallback when 'url' is absent.
+      def parse_images(images_data)
+        images_data.map do |entry|
+          location = entry['url'] || entry[:url] || entry['src'] || entry[:src]
+          alt_text = entry['alt'] || entry[:alt]
+          caption = entry['title'] || entry[:title]
+          pixel_width = entry['width'] || entry[:width]
+          pixel_height = entry['height'] || entry[:height]
+          DjotImage.new(url: location, alt: alt_text, title: caption, width: pixel_width, height: pixel_height)
+        end
+      end
+      # rubocop:enable Metrics/CyclomaticComplexity
+      # rubocop:disable Metrics/CyclomaticComplexity
+      # Wraps each raw link Hash in a DjotLink. Keys are tried as strings
+      # first, then symbols; 'href' is the fallback when 'url' is absent.
+      def parse_links(links_data)
+        links_data.map do |entry|
+          target = entry['url'] || entry[:url] || entry['href'] || entry[:href]
+          label = entry['text'] || entry[:text]
+          caption = entry['title'] || entry[:title]
+          kind = entry['link_type'] || entry[:link_type]
+          DjotLink.new(url: target, text: label, title: caption, link_type: kind)
+        end
+      end
+      # rubocop:enable Metrics/CyclomaticComplexity
+      # Wraps each raw footnote Hash (string keys tried first, then symbols)
+      # in a Footnote.
+      def parse_footnotes(footnotes_data)
+        footnotes_data.map do |entry|
+          label = entry['label'] || entry[:label]
+          body = entry['content'] || entry[:content]
+          Footnote.new(label: label, content: body)
+        end
+      end
+    end
+  end
+end

data/lib/kreuzberg/extraction_api.rb CHANGED Viewed

@@ -15,11 +15,15 @@ module Kreuzberg
     # @example Extract with explicit MIME type
     # @example Extract with OCR enabled
     def extract_file_sync(path:, mime_type: nil, config: nil)
+      # Validate that the file exists
+      path_str = path.to_s
+      raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
       opts = normalize_config(config)
       hash = if mime_type
-               native_extract_file_sync(path.to_s, mime_type.to_s, **opts)
+               native_extract_file_sync(path_str, mime_type.to_s, **opts)
              else
-               native_extract_file_sync(path.to_s, **opts)
+               native_extract_file_sync(path_str, **opts)
              end
       result = Result.new(hash)
       record_cache_entry!(result, opts)
@@ -53,6 +57,8 @@ module Kreuzberg
     #   response = HTTParty.get("https://example.com/document.docx")
     #   result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
     def extract_bytes_sync(data:, mime_type:, config: nil)
+      raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
       opts = normalize_config(config)
       hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
       result = Result.new(hash)
@@ -92,6 +98,12 @@ module Kreuzberg
     #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
     #   results = Kreuzberg.batch_extract_files_sync(paths, config: config)
     def batch_extract_files_sync(paths:, config: nil)
+      # Validate that all files exist
+      paths.each do |path|
+        path_str = path.to_s
+        raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
+      end
       opts = normalize_config(config)
       hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
       results = hashes.map { |hash| Result.new(hash) }
@@ -130,11 +142,15 @@ module Kreuzberg
     #   )
     #   result = Kreuzberg.extract_file("document.pdf", config: config)
     def extract_file(path:, mime_type: nil, config: nil)
+      # Validate that the file exists
+      path_str = path.to_s
+      raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
       opts = normalize_config(config)
       hash = if mime_type
-               native_extract_file(path.to_s, mime_type.to_s, **opts)
+               native_extract_file(path_str, mime_type.to_s, **opts)
              else
-               native_extract_file(path.to_s, **opts)
+               native_extract_file(path_str, **opts)
              end
       result = Result.new(hash)
       record_cache_entry!(result, opts)

data/lib/kreuzberg/result.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Kreuzberg
   # rubocop:disable Metrics/ClassLength
   class Result
     attr_reader :content, :mime_type, :metadata, :metadata_json, :tables,
-                :detected_languages, :chunks, :images, :pages, :elements
+                :detected_languages, :chunks, :images, :pages, :elements, :djot_content
     # @!attribute [r] cells
     #   @return [Array<Array<String>>] Table cells (2D array)
@@ -180,6 +180,7 @@ module Kreuzberg
     #
     # @param hash [Hash] Hash returned from native extension
     #
+    # rubocop:disable Metrics/AbcSize
     def initialize(hash)
       @content = get_value(hash, 'content', '')
       @mime_type = get_value(hash, 'mime_type', '')
@@ -191,7 +192,9 @@ module Kreuzberg
       @images = parse_images(get_value(hash, 'images'))
       @pages = parse_pages(get_value(hash, 'pages'))
       @elements = parse_elements(get_value(hash, 'elements'))
+      @djot_content = parse_djot_content(get_value(hash, 'djot_content'))
     end
+    # rubocop:enable Metrics/AbcSize
     # Convert to hash
     #
@@ -207,7 +210,8 @@ module Kreuzberg
         chunks: serialize_chunks,
         images: serialize_images,
         pages: serialize_pages,
-        elements: serialize_elements
+        elements: serialize_elements,
+        djot_content: @djot_content&.to_h
       }
     end
@@ -434,6 +438,12 @@ module Kreuzberg
         y1: coordinates_data['y1'].to_f
       )
     end
+    # Wraps the djot payload in a DjotContent; returns nil when no payload is present.
+    def parse_djot_content(djot_data)
+      DjotContent.new(djot_data) unless djot_data.nil?
+    end
   end
   # rubocop:enable Metrics/ClassLength
 end

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.1.2'
+  VERSION = '4.2.1'
 end

data/lib/kreuzberg.rb CHANGED Viewed

@@ -87,6 +87,7 @@ end
 require_relative 'kreuzberg/cache_api'
 require_relative 'kreuzberg/extraction_api'
+require_relative 'kreuzberg/djot_content'
 Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
 Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -202,6 +202,8 @@ module Kreuzberg
       attr_reader html_options: HtmlOptions?
       attr_reader pages: PageConfig?
       attr_reader max_concurrent_extractions: Integer?
+      attr_reader output_format: String?
+      attr_reader result_format: String?
       def self.from_file: (String path) -> Extraction
       def initialize: (
@@ -219,7 +221,9 @@ module Kreuzberg
         ?keywords: (Keywords | Hash[Symbol, untyped])?,
         ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
         ?pages: (PageConfig | Hash[Symbol, untyped])?,
-        ?max_concurrent_extractions: Integer?
+        ?max_concurrent_extractions: Integer?,
+        ?output_format: String?,
+        ?result_format: String?
       ) -> void
       def to_h: () -> Hash[Symbol, untyped]
@@ -413,14 +417,23 @@ module Kreuzberg
       attr_reader plain_text: String
       attr_reader blocks: Array[DjotContent::FormattedBlock]
       attr_reader metadata: Hash[untyped, untyped]
-      attr_reader tables: Array[Table]
+      attr_reader metadata_json: String
+      attr_reader tables: Array[untyped]
       attr_reader images: Array[DjotContent::DjotImage]
       attr_reader links: Array[DjotContent::DjotLink]
       attr_reader footnotes: Array[DjotContent::Footnote]
       attr_reader attributes: Hash[String, untyped]?
-      def initialize: (djot_content_hash hash) -> void
-      def to_h: () -> djot_content_hash
+      def initialize: (untyped hash) -> void
+      def to_h: () -> Hash[Symbol, untyped]
+      private
+      def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
+      def parse_blocks: (Array[untyped] blocks_data) -> Array[FormattedBlock]
+      def parse_images: (Array[untyped] images_data) -> Array[DjotImage]
+      def parse_links: (Array[untyped] links_data) -> Array[DjotLink]
+      def parse_footnotes: (Array[untyped] footnotes_data) -> Array[Footnote]
       class FormattedBlock
         attr_reader block_type: String
@@ -429,28 +442,31 @@ module Kreuzberg
         attr_reader children: Array[FormattedBlock]?
         attr_reader attributes: Hash[String, untyped]?
-        def initialize: (formatted_block_hash hash) -> void
-        def to_h: () -> formatted_block_hash
+        def initialize: (?untyped hash_or_type, ?children: untyped, ?attributes: untyped, ?content: untyped, ?level: untyped, ?block_type: untyped) -> void
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotImage
         attr_reader url: String
         attr_reader alt: String?
         attr_reader title: String?
-        attr_reader attributes: Hash[String, untyped]?
+        attr_reader width: Integer?
+        attr_reader height: Integer?
-        def initialize: (djot_image_hash hash) -> void
-        def to_h: () -> djot_image_hash
+        def initialize: (?untyped hash_or_url, ?alt: untyped, ?title: untyped, ?width: untyped, ?height: untyped, ?url: untyped, ?src: untyped) -> void
+        def src: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class DjotLink
         attr_reader url: String
-        attr_reader text: String
+        attr_reader text: String?
         attr_reader title: String?
         attr_reader link_type: String?
-        def initialize: (djot_link_hash hash) -> void
-        def to_h: () -> djot_link_hash
+        def initialize: (?untyped hash_or_url, ?text: untyped, ?title: untyped, ?url: untyped, ?href: untyped, ?link_type: untyped) -> void
+        def href: () -> String
+        def to_h: () -> Hash[Symbol, untyped]
       end
       class Footnote

data/spec/binding/batch_operations_spec.rb CHANGED Viewed

@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
       paths.each { |p| FileUtils.rm_f(p) }
     end
   end
+  describe 'batch with output and result formats' do
+    # Writes +content+ to a temporary .txt file and yields a one-element path
+    # list. The file is removed in +ensure+, so a failing expectation or an
+    # extraction error cannot leave it behind.
+    def with_format_fixture(content)
+      file = Tempfile.new(['format_test', '.txt'])
+      file.write(content)
+      file.close
+      yield [file.path]
+    ensure
+      # close! closes and unlinks in one step
+      file&.close!
+    end
+    # Runs a batch extraction over +paths+ and checks the result shape shared
+    # by every example here: an Array holding exactly one Kreuzberg::Result.
+    def expect_single_result(paths, config)
+      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
+      expect(results).to be_an Array
+      expect(results.length).to eq 1
+      expect(results[0]).to be_a Kreuzberg::Result
+    end
+    it 'batch processes with output_format' do
+      config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
+      with_format_fixture('Test content for output format') do |paths|
+        expect_single_result(paths, config)
+      end
+    end
+    it 'batch processes with result_format' do
+      config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
+      with_format_fixture('Test content for result format') do |paths|
+        expect_single_result(paths, config)
+      end
+    end
+    it 'batch processes with both output and result formats' do
+      config = Kreuzberg::Config::Extraction.new(
+        output_format: 'plain',
+        result_format: 'element_based'
+      )
+      with_format_fixture('Test content for both formats') do |paths|
+        expect_single_result(paths, config)
+      end
+    end
+    it 'batch processes with chunking and output_format' do
+      config = Kreuzberg::Config::Extraction.new(
+        output_format: 'markdown',
+        chunking: { max_chars: 1000 }
+      )
+      with_format_fixture('Test content ' * 100) do |paths|
+        expect_single_result(paths, config)
+      end
+    end
+  end
 end

data/spec/binding/batch_spec.rb CHANGED Viewed

@@ -295,7 +295,7 @@ RSpec.describe Kreuzberg do
   end
   describe 'batch error handling' do
-    it 'handles missing files gracefully in batch' do
+    it 'raises IOError for missing files in batch' do
       paths = [
         '/nonexistent/file1.txt',
         '/nonexistent/file2.txt'
@@ -303,10 +303,10 @@ RSpec.describe Kreuzberg do
       expect do
         described_class.batch_extract_files_sync(paths: paths)
-      end.not_to raise_error
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     end
-    it 'handles mixed valid and invalid paths' do
+    it 'raises IOError when batch contains invalid paths' do
       paths = []
       temp_dir = Dir.mktmpdir
@@ -316,8 +316,9 @@ RSpec.describe Kreuzberg do
       paths << '/nonexistent/invalid.txt'
-      results = described_class.batch_extract_files_sync(paths: paths)
-      expect(results).to be_a(Array)
+      expect do
+        described_class.batch_extract_files_sync(paths: paths)
+      end.to raise_error(Kreuzberg::Errors::IOError, /not found/)
     ensure
       FileUtils.remove_entry(temp_dir)
     end

data/spec/binding/error_recovery_spec.rb CHANGED Viewed

@@ -57,7 +57,7 @@ RSpec.describe 'Error Recovery' do
       nonexistent_path = '/nonexistent/file/that/does/not/exist.pdf'
       expect { Kreuzberg.extract_file_sync(path: nonexistent_path, config: config) }
-        .to raise_error(Kreuzberg::Errors::ValidationError, /not found|does not exist|no such file/)
+        .to raise_error(Kreuzberg::Errors::IOError, /not found|does not exist|no such file/)
     end
     it 'provides descriptive error messages for invalid MIME types' do
@@ -293,7 +293,7 @@ RSpec.describe 'Error Recovery' do
       expect(validation_error).to be_a(ArgumentError)
-      # Runtime error (file not found)
+      # Runtime error (file not found) - IOError since the file doesn't exist
       runtime_error = nil
       begin
         Kreuzberg.extract_file_sync(path: '/nonexistent/file.pdf')
@@ -301,7 +301,7 @@ RSpec.describe 'Error Recovery' do
         runtime_error = e
       end
-      expect(runtime_error).to be_a(Kreuzberg::Errors::ValidationError)
+      expect(runtime_error).to be_a(Kreuzberg::Errors::IOError)
     end
     it 'provides error recovery suggestions in messages' do