RubyGems - kreuzberg - Versions diffs - 4.1.1 → 4.2.0 - Mend

kreuzberg 4.1.1 → 4.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/README.md +8 -5
data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
data/kreuzberg.gemspec +14 -2
data/lib/kreuzberg/api_proxy.rb +0 -1
data/lib/kreuzberg/cli_proxy.rb +0 -1
data/lib/kreuzberg/config.rb +70 -35
data/lib/kreuzberg/mcp_proxy.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +5 -1
data/spec/binding/batch_operations_spec.rb +80 -0
data/spec/binding/metadata_types_spec.rb +77 -57
data/spec/serialization_spec.rb +134 -0
data/spec/unit/config/output_format_spec.rb +380 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -3
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/embeddings.rs +4 -4
data/vendor/kreuzberg/src/mcp/format.rs +237 -39
data/vendor/kreuzberg/src/mcp/params.rs +26 -33
data/vendor/kreuzberg/src/mcp/server.rs +6 -3
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
data/vendor/kreuzberg/tests/api_embed.rs +84 -50
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
data/vendor/kreuzberg/tests/api_tests.rs +298 -139
data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
data/vendor/kreuzberg/tests/config_features.rs +19 -15
data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
data/vendor/kreuzberg/tests/core_integration.rs +55 -53
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
data/vendor/kreuzberg/tests/email_integration.rs +7 -7
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/error_handling.rs +13 -11
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/page_markers.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
data/vendor/kreuzberg/tests/security_validation.rs +20 -19
data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
data/vendor/kreuzberg-tesseract/build.rs +4 -4
data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
metadata +13 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0c1c0519fb3a58c45ec553994bd982b4f284835bd35ea0758461f6f381accfd6
-  data.tar.gz: 161c18cfabdd20bdaa520abda521cb16072dcc00f5fd2e41152d9da4acdb9d08
+  metadata.gz: 9a1c9adffca7d75c142bd661f1d481b1aee00d97c6f62dcc70292f37978bcc17
+  data.tar.gz: 227af2ed45bff1dfa9afebd69220d15a41b2e476bf97f8a83173d21aab8b88e1
 SHA512:
-  metadata.gz: 7c6e1768022dcfdef5eaaaa3557a8388e8ad45158a69ed022d852b07f7658cbb885ca7860fa32eda8f29b5f7ea4216f93033aad77614afb82578b9157ed92710
-  data.tar.gz: 96ca3f1f3c6d6f9ea6dc826f7704c1d741ffaab24f524f099c4eb294652211ac6b9f324874974bb24d417193e76b1bce951315330d6f1ccd8eb5b014fa7fc71f
+  metadata.gz: 0d1b0081f89a73f5422e68a714fc415f6d290dd8be7cf0ba6d454cfdf1938ebdac4919358b25d6e5a0bc1a209e1b165062a0341d28cde1b3fa0595bffec837f5
+  data.tar.gz: fc5a5f29309c29fbbf63ba035cf5e462e78b15c2afc239fb333bd8b6e70ef061219822ed4d533f81bb35cdd62db84da8c01e8f172561b0f7fb802b848b491c0a

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.1.1)
+    kreuzberg (4.2.0)
 GEM
   remote: https://rubygems.org/
@@ -121,7 +121,7 @@ GEM
       rubocop (~> 1.81)
     ruby-progressbar (1.13.0)
     securerandom (0.4.1)
-    sorbet-runtime (0.6.12894)
+    sorbet-runtime (0.6.12897)
     steep (1.10.0)
       activesupport (>= 5.1)
       concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
-  kreuzberg (4.1.1)
+  kreuzberg (4.2.0)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -242,7 +242,7 @@ CHECKSUMS
   rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
   securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
-  sorbet-runtime (0.6.12894) sha256=4f0cbe041d80dac973ec3a5a848679922074dd77cc19f46384b27a8b9ff4a90c
+  sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
   steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
   strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
   terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.1" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.0" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -86,10 +86,13 @@ gem 'kreuzberg'
 ### System Requirements
-- **Ruby 2.7+** required
+- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
+- Ruby 4.0+ is fully supported with no code changes required
 - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
 - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
+**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
 ## Quick Start
@@ -202,9 +205,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
 ## Features
-### Supported File Formats (56+)
+### Supported File Formats (57+)
-56 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
+57 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
 #### Office Documents
@@ -230,7 +233,7 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
 |----------|---------|----------|
 | **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
 | **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
-| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
+| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
 #### Email & Archives

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -31,7 +31,7 @@ embeddings = ["kreuzberg/embeddings"]
 [dependencies]
 async-trait = "0.1.89"
-kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false, features = [
+kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, features = [
     "pdf",
     "excel",
     "office",
@@ -51,7 +51,7 @@ kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false
     "bundled-pdfium",
     "tokio-runtime",
 ] }
-kreuzberg-ffi = { path = "../../../../../crates/kreuzberg-ffi" }
+kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
 magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
     "rb-sys",
 ] }

data/ext/kreuzberg_rb/native/libpdfium.so ADDED Viewed

Binary file

data/ext/kreuzberg_rb/native/src/config/types.rs CHANGED Viewed

@@ -1025,8 +1025,10 @@ pub fn config_from_file(path: String) -> Result<RHash, Error> {
         .and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
 }
-/// Discover extraction config from current directory
+/// Discover extraction config from current directory or parent directories
 pub fn config_discover() -> Result<Value, Error> {
+    use std::path::PathBuf;
     let ruby = Ruby::get().expect("Ruby not initialized");
     // Search for config files in order of precedence
@@ -1038,19 +1040,27 @@ pub fn config_discover() -> Result<Value, Error> {
         (".kreuzbergrc", "json"),
     ];
-    for (name, format) in config_files {
-        if let Ok(content) = fs::read_to_string(name) {
-            let json_value: serde_json::Value = match format {
-                "toml" => toml::from_str(&content)
-                    .map_err(|e| validation_error(format!("Invalid TOML in {}: {}", name, e)))?,
-                "yaml" => serde_yaml_ng::from_str(&content)
-                    .map_err(|e| validation_error(format!("Invalid YAML in {}: {}", name, e)))?,
-                "json" => serde_json::from_str(&content)
-                    .map_err(|e| validation_error(format!("Invalid JSON in {}: {}", name, e)))?,
-                _ => unreachable!(),
-            };
-            return json_value_to_ruby(&ruby, &json_value);
+    // Start from current directory and search up to parent directories
+    let mut current_dir: Option<PathBuf> = std::env::current_dir().ok();
+    while let Some(dir) = current_dir {
+        for (name, format) in &config_files {
+            let config_path = dir.join(name);
+            if let Ok(content) = fs::read_to_string(&config_path) {
+                let json_value: serde_json::Value = match *format {
+                    "toml" => toml::from_str(&content)
+                        .map_err(|e| validation_error(format!("Invalid TOML in {}: {}", config_path.display(), e)))?,
+                    "yaml" => serde_yaml_ng::from_str(&content)
+                        .map_err(|e| validation_error(format!("Invalid YAML in {}: {}", config_path.display(), e)))?,
+                    "json" => serde_json::from_str(&content)
+                        .map_err(|e| validation_error(format!("Invalid JSON in {}: {}", config_path.display(), e)))?,
+                    _ => unreachable!(),
+                };
+                return json_value_to_ruby(&ruby, &json_value);
+            }
         }
+        // Move to parent directory
+        current_dir = dir.parent().map(|p| p.to_path_buf());
     }
     // Return nil if no config found

data/kreuzberg.gemspec CHANGED Viewed

@@ -130,10 +130,22 @@ vendor_files = Dir.chdir(__dir__) do
   kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
 end
+# When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
+# with vendor paths) instead of from git (which has original 5-level crate paths)
+ext_files_from_fs = Dir.chdir(__dir__) do
+  Dir.glob('ext/**/*', File::FNM_DOTMATCH)
+     .reject { |f| File.directory?(f) }
+     .reject { |f| f.include?('/target/') }
+     .grep_v(/\.(swp|bak|tmp)$/)
+     .grep_v(/~$/)
+end
 files = if (ruby_files + core_files + ffi_files).empty?
           fallback_files
         elsif vendor_files.any?
-          ruby_files + vendor_files
+          # Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
+          non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
+          non_ext_ruby_files + ext_files_from_fs + vendor_files
         else
           ruby_files + core_files + ffi_files
         end
@@ -165,7 +177,7 @@ Gem::Specification.new do |spec|
   DESC
   spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
   spec.license = 'MIT'
-  spec.required_ruby_version = '>= 3.2.0'
+  spec.required_ruby_version = '>= 3.2.0', '< 5.0'
   spec.metadata = {
     'homepage_uri' => spec.homepage,

data/lib/kreuzberg/api_proxy.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # frozen_string_literal: true
 require 'open3'
-require 'pathname'
 module Kreuzberg
   # @example Start the server

data/lib/kreuzberg/cli_proxy.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # frozen_string_literal: true
 require 'open3'
-require 'pathname'
 module Kreuzberg
   # @example

data/lib/kreuzberg/config.rb CHANGED Viewed

@@ -717,7 +717,7 @@ module Kreuzberg
                   :ocr, :chunking, :language_detection, :pdf_options,
                   :image_extraction, :image_preprocessing, :postprocessor,
                   :token_reduction, :keywords, :html_options, :pages,
-                  :max_concurrent_extractions
+                  :max_concurrent_extractions, :output_format, :result_format
       # Load configuration from a file.
       #
@@ -738,7 +738,7 @@ module Kreuzberg
         use_cache enable_quality_processing force_ocr ocr chunking
         language_detection pdf_options image_extraction image_preprocessing
         postprocessor token_reduction keywords html_options pages
-        max_concurrent_extractions
+        max_concurrent_extractions output_format result_format
       ].freeze
       # Aliases for backward compatibility
@@ -789,41 +789,67 @@ module Kreuzberg
         new(**normalize_hash_keys(hash))
       end
-      def initialize(
-        use_cache: true,
-        enable_quality_processing: false,
-        force_ocr: false,
-        ocr: nil,
-        chunking: nil,
-        language_detection: nil,
-        pdf_options: nil,
-        image_extraction: nil,
-        image_preprocessing: nil,
-        postprocessor: nil,
-        token_reduction: nil,
-        keywords: nil,
-        html_options: nil,
-        pages: nil,
-        max_concurrent_extractions: nil
-      )
-        @use_cache = use_cache ? true : false
-        @enable_quality_processing = enable_quality_processing ? true : false
-        @force_ocr = force_ocr ? true : false
-        @ocr = normalize_config(ocr, OCR)
-        @chunking = normalize_config(chunking, Chunking)
-        @language_detection = normalize_config(language_detection, LanguageDetection)
-        @pdf_options = normalize_config(pdf_options, PDF)
-        @image_extraction = normalize_config(image_extraction, ImageExtraction)
-        @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
-        @postprocessor = normalize_config(postprocessor, PostProcessor)
-        @token_reduction = normalize_config(token_reduction, TokenReduction)
-        @keywords = normalize_config(keywords, Keywords)
-        @html_options = normalize_config(html_options, HtmlOptions)
-        @pages = normalize_config(pages, PageConfig)
-        @max_concurrent_extractions = max_concurrent_extractions&.to_i
+      def initialize(hash = nil,
+                     use_cache: true,
+                     enable_quality_processing: false,
+                     force_ocr: false,
+                     ocr: nil,
+                     chunking: nil,
+                     language_detection: nil,
+                     pdf_options: nil,
+                     image_extraction: nil,
+                     image_preprocessing: nil,
+                     postprocessor: nil,
+                     token_reduction: nil,
+                     keywords: nil,
+                     html_options: nil,
+                     pages: nil,
+                     max_concurrent_extractions: nil,
+                     output_format: nil,
+                     result_format: nil)
+        kwargs = {
+          use_cache: use_cache, enable_quality_processing: enable_quality_processing,
+          force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
+          pdf_options: pdf_options, image_extraction: image_extraction,
+          image_preprocessing: image_preprocessing, postprocessor: postprocessor,
+          token_reduction: token_reduction, keywords: keywords, html_options: html_options,
+          pages: pages, max_concurrent_extractions: max_concurrent_extractions,
+          output_format: output_format, result_format: result_format
+        }
+        extracted = extract_from_hash(hash, kwargs)
+        assign_attributes(extracted)
+      end
+      def extract_from_hash(hash, defaults)
+        return defaults unless hash.is_a?(Hash)
+        hash = hash.transform_keys(&:to_sym)
+        defaults.merge(hash.slice(*defaults.keys))
+      end
+      def assign_attributes(params)
+        @use_cache = params[:use_cache] ? true : false
+        @enable_quality_processing = params[:enable_quality_processing] ? true : false
+        @force_ocr = params[:force_ocr] ? true : false
+        @ocr = normalize_config(params[:ocr], OCR)
+        @chunking = normalize_config(params[:chunking], Chunking)
+        @language_detection = normalize_config(params[:language_detection], LanguageDetection)
+        @pdf_options = normalize_config(params[:pdf_options], PDF)
+        @image_extraction = normalize_config(params[:image_extraction], ImageExtraction)
+        @image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
+        @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
+        @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
+        @keywords = normalize_config(params[:keywords], Keywords)
+        @html_options = normalize_config(params[:html_options], HtmlOptions)
+        @pages = normalize_config(params[:pages], PageConfig)
+        @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
+        @output_format = params[:output_format]&.to_s
+        @result_format = params[:result_format]&.to_s
       end
       # rubocop:disable Metrics/CyclomaticComplexity
+      # rubocop:disable Metrics/MethodLength
       def to_h
         {
           use_cache: @use_cache,
@@ -840,9 +866,12 @@ module Kreuzberg
           keywords: @keywords&.to_h,
           html_options: @html_options&.to_h,
           pages: @pages&.to_h,
-          max_concurrent_extractions: @max_concurrent_extractions
+          max_concurrent_extractions: @max_concurrent_extractions,
+          output_format: @output_format,
+          result_format: @result_format
         }.compact
       end
+      # rubocop:enable Metrics/MethodLength
       # rubocop:enable Metrics/CyclomaticComplexity
       # Serialize configuration to JSON string
@@ -981,6 +1010,10 @@ module Kreuzberg
           @pages = normalize_config(value, PageConfig)
         when :max_concurrent_extractions
           @max_concurrent_extractions = value&.to_i
+        when :output_format
+          @output_format = value&.to_s
+        when :result_format
+          @result_format = value&.to_s
         else
           raise ArgumentError, "Unknown configuration key: #{key}"
         end
@@ -1028,6 +1061,8 @@ module Kreuzberg
         @html_options = merged.html_options
         @pages = merged.pages
         @max_concurrent_extractions = merged.max_concurrent_extractions
+        @output_format = merged.output_format
+        @result_format = merged.result_format
       end
     end
   end

data/lib/kreuzberg/mcp_proxy.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # frozen_string_literal: true
 require 'open3'
-require 'pathname'
 require 'json'
 module Kreuzberg

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.1.1'
+  VERSION = '4.2.0'
 end

data/sig/kreuzberg.rbs CHANGED Viewed

@@ -202,6 +202,8 @@ module Kreuzberg
       attr_reader html_options: HtmlOptions?
       attr_reader pages: PageConfig?
       attr_reader max_concurrent_extractions: Integer?
+      attr_reader output_format: String?
+      attr_reader result_format: String?
       def self.from_file: (String path) -> Extraction
       def initialize: (
@@ -219,7 +221,9 @@ module Kreuzberg
         ?keywords: (Keywords | Hash[Symbol, untyped])?,
         ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
         ?pages: (PageConfig | Hash[Symbol, untyped])?,
-        ?max_concurrent_extractions: Integer?
+        ?max_concurrent_extractions: Integer?,
+        ?output_format: String?,
+        ?result_format: String?
       ) -> void
       def to_h: () -> Hash[Symbol, untyped]

data/spec/binding/batch_operations_spec.rb CHANGED Viewed

@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
       paths.each { |p| FileUtils.rm_f(p) }
     end
   end
+  describe 'batch with output and result formats' do
+    it 'batch processes with output_format' do
+      paths = []
+      file = Tempfile.new(['format_test', '.txt']).tap do |f|
+        f.write('Test content for output format')
+        f.close
+      end
+      paths << file.path
+      config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
+      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
+      expect(results).to be_an Array
+      expect(results.length).to eq 1
+      expect(results[0]).to be_a Kreuzberg::Result
+      paths.each { |p| FileUtils.rm_f(p) }
+    end
+    it 'batch processes with result_format' do
+      paths = []
+      file = Tempfile.new(['format_test', '.txt']).tap do |f|
+        f.write('Test content for result format')
+        f.close
+      end
+      paths << file.path
+      config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
+      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
+      expect(results).to be_an Array
+      expect(results.length).to eq 1
+      expect(results[0]).to be_a Kreuzberg::Result
+      paths.each { |p| FileUtils.rm_f(p) }
+    end
+    it 'batch processes with both output and result formats' do
+      paths = []
+      file = Tempfile.new(['format_test', '.txt']).tap do |f|
+        f.write('Test content for both formats')
+        f.close
+      end
+      paths << file.path
+      config = Kreuzberg::Config::Extraction.new(
+        output_format: 'plain',
+        result_format: 'element_based'
+      )
+      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
+      expect(results).to be_an Array
+      expect(results.length).to eq 1
+      expect(results[0]).to be_a Kreuzberg::Result
+      paths.each { |p| FileUtils.rm_f(p) }
+    end
+    it 'batch processes with chunking and output_format' do
+      paths = []
+      file = Tempfile.new(['format_test', '.txt']).tap do |f|
+        f.write('Test content ' * 100)
+        f.close
+      end
+      paths << file.path
+      config = Kreuzberg::Config::Extraction.new(
+        output_format: 'markdown',
+        chunking: { max_chars: 1000 }
+      )
+      results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
+      expect(results).to be_an Array
+      expect(results.length).to eq 1
+      expect(results[0]).to be_a Kreuzberg::Result
+      paths.each { |p| FileUtils.rm_f(p) }
+    end
+  end
 end

data/spec/binding/metadata_types_spec.rb CHANGED Viewed

@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
   describe 'Thread Safety: Concurrent Extraction' do
     it 'handles concurrent extraction safely' do
-      test_files = []
-      results = []
-      errors = []
+      test_files = create_concurrent_test_files
+      results, errors = run_concurrent_extractions(test_files)
-      5.times do |i|
-        html_content = <<~HTML
-          <html>
-          <head>
-            <title>Concurrent Test #{i}</title>
-            <meta name="description" content="Test document #{i}">
-            <meta name="keywords" content="test#{i}, concurrent, thread-safe">
-          </head>
-          <body>
-            <h1>Test Document #{i}</h1>
-            <p>Content for test #{i}</p>
-            <a href="/page-#{i}">Link #{i}</a>
-            <img src="image-#{i}.jpg" alt="Image #{i}">
-          </body>
-          </html>
-        HTML
-        test_files << create_test_html_file(html_content)
-      end
-      begin
-        threads = test_files.map do |file|
-          Thread.new do
-            result = Kreuzberg.extract_file_sync(path: file)
-            results << result
-          rescue StandardError => e
-            errors << e
-          end
-        end
-        threads.each(&:join)
-        expect(errors).to be_empty
-        expect(results.length).to eq(5)
-        results.each do |result|
-          expect(result).to be_a(Kreuzberg::Result)
-          expect(result.metadata).not_to be_nil
-          metadata = result.metadata
-          next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
-          expect(metadata.title).not_to be_nil
-          expect(metadata.description).not_to be_nil
-          expect(metadata.keywords).to be_a(Array)
-          expect(metadata.headers).to be_a(Array)
-          expect(metadata.links).to be_a(Array)
-          expect(metadata.images).to be_a(Array)
-        end
-        titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
-        expect(titles.uniq.length).to eq(5)
-      ensure
-        test_files.each { |f| FileUtils.rm_f(f) }
-      end
+      expect(results).not_to be_empty
+      verify_concurrent_results(results, errors, test_files)
     end
   end
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
     file.close
     file.path
   end
+  def create_concurrent_test_files
+    test_files = []
+    5.times do |i|
+      html_content = <<~HTML
+        <html>
+        <head>
+          <title>Concurrent Test #{i}</title>
+          <meta name="description" content="Test document #{i}">
+          <meta name="keywords" content="test#{i}, concurrent, thread-safe">
+        </head>
+        <body>
+          <h1>Test Document #{i}</h1>
+          <p>Content for test #{i}</p>
+          <a href="/page-#{i}">Link #{i}</a>
+          <img src="image-#{i}.jpg" alt="Image #{i}">
+        </body>
+        </html>
+      HTML
+      test_files << create_test_html_file(html_content)
+    end
+    test_files
+  end
+  def run_concurrent_extractions(test_files)
+    results = []
+    errors = []
+    threads = test_files.map do |file|
+      Thread.new do
+        result = Kreuzberg.extract_file_sync(path: file)
+        results << result
+      rescue StandardError => e
+        errors << e
+      end
+    end
+    threads.each(&:join)
+    [results, errors]
+  end
+  def verify_concurrent_results(results, errors, test_files)
+    expect(errors).to be_empty
+    expect(results.length).to eq(5)
+    results.each do |result|
+      expect(result).to be_a(Kreuzberg::Result)
+      expect(result.metadata).not_to be_nil
+      metadata = result.metadata
+      next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
+      verify_metadata_fields(metadata)
+    end
+    titles = extract_titles(results)
+    expect(titles.uniq.length).to eq(5)
+  ensure
+    test_files.each { |f| FileUtils.rm_f(f) }
+  end
+  def verify_metadata_fields(metadata)
+    expect(metadata.title).not_to be_nil
+    expect(metadata.description).not_to be_nil
+    expect(metadata.keywords).to be_a(Array)
+    expect(metadata.headers).to be_a(Array)
+    expect(metadata.links).to be_a(Array)
+    expect(metadata.images).to be_a(Array)
+  end
+  def extract_titles(results)
+    results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
+  end
 end