RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11 - Mend

kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

checksums.yaml +4 -4
data/Gemfile.lock +12 -9
data/README.md +22 -0
data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
data/kreuzberg.gemspec +34 -2
data/lib/kreuzberg/cache_api.rb +35 -0
data/lib/kreuzberg/error_context.rb +49 -1
data/lib/kreuzberg/extraction_api.rb +255 -0
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +6 -0
data/lib/libpdfium.dylib +0 -0
data/sig/kreuzberg.rbs +9 -0
data/vendor/Cargo.toml +44 -0
data/vendor/kreuzberg/Cargo.toml +61 -38
data/vendor/kreuzberg/README.md +36 -27
data/vendor/kreuzberg/build.rs +197 -245
data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
data/vendor/kreuzberg/src/embeddings.rs +71 -3
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/html.rs +37 -5
data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
data/vendor/kreuzberg/src/pdf/table.rs +3 -0
data/vendor/kreuzberg/src/pdf/text.rs +2 -2
data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
data/vendor/kreuzberg/tests/format_integration.rs +4 -1
data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
data/vendor/kreuzberg-ffi/README.md +851 -0
data/vendor/kreuzberg-ffi/build.rs +176 -0
data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
data/vendor/kreuzberg-tesseract/LICENSE +22 -0
data/vendor/kreuzberg-tesseract/README.md +399 -0
data/vendor/kreuzberg-tesseract/build.rs +1354 -0
data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
metadata +39 -3
data/vendor/rb-sys/bin/release.sh +0 -21

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
 [package]
 name = "kreuzberg-rb"
-version = "4.0.0-rc.8"
+version = "4.0.0-rc.11"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -29,7 +29,7 @@ default = []
 [dependencies]
 async-trait = "0.1.89"
-kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full", "embeddings"] }
+kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"] }
 kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
 magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
     "rb-sys",
@@ -48,7 +48,7 @@ tokio = { version = "1.48.0", features = [
     "time",
     "io-util",
 ] }
-html-to-markdown-rs = { version = "2.14.1", default-features = false }
+html-to-markdown-rs = { version = "2.14.2", default-features = false }
 [dev-dependencies]
 pretty_assertions = "1.4"

data/ext/kreuzberg_rb/native/src/lib.rs CHANGED Viewed

@@ -23,7 +23,9 @@ use kreuzberg::{
 use magnus::exception::ExceptionClass;
 use magnus::r_hash::ForEach;
 use magnus::value::ReprValue;
-use magnus::{Error, IntoValue, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
+use magnus::{
+    Error, IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args,
+};
 use std::fs;
 use std::path::{Path, PathBuf};
@@ -52,10 +54,15 @@ impl Drop for GcGuardedValue {
     }
 }
-unsafe extern "C" {
-    fn kreuzberg_last_error_code() -> i32;
-    fn kreuzberg_last_panic_context() -> *const std::ffi::c_char;
-    fn kreuzberg_free_string(s: *mut std::ffi::c_char);
+use std::ffi::c_char;
+// These C ABI functions are provided by the kreuzberg-ffi crate
+// We declare them here to ensure proper linking on all platforms
+#[link(name = "kreuzberg_ffi", kind = "static")]
+extern "C" {
+    pub fn kreuzberg_last_error_code() -> i32;
+    pub fn kreuzberg_last_panic_context() -> *mut c_char;
+    pub fn kreuzberg_free_string(s: *mut c_char);
 }
 /// Retrieve panic context from FFI if available
@@ -1797,13 +1804,16 @@ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
 ///
 fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
     let ruby = Ruby::get().expect("Ruby not initialized");
-    let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
+    let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
     let (data, mime_type) = args.required;
     let opts = Some(args.keywords);
     let config = parse_extraction_config(&ruby, opts)?;
-    let result = kreuzberg::extract_bytes_sync(data.as_bytes(), &mime_type, &config).map_err(kreuzberg_error)?;
+    // SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
+    // borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
+    let bytes = unsafe { data.as_slice() };
+    let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
     extraction_result_to_ruby(&ruby, result)
 }
@@ -1877,7 +1887,7 @@ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
 ///
 fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
     let ruby = Ruby::get().expect("Ruby not initialized");
-    let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
+    let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
     let (data, mime_type) = args.required;
     let opts = Some(args.keywords);
@@ -1886,8 +1896,11 @@ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
     let runtime =
         tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
+    // SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
+    // borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
+    let bytes = unsafe { data.as_slice() };
     let result = runtime
-        .block_on(async { kreuzberg::extract_bytes(data.as_bytes(), &mime_type, &config).await })
+        .block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
         .map_err(kreuzberg_error)?;
     extraction_result_to_ruby(&ruby, result)
@@ -1944,7 +1957,10 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
     let config = parse_extraction_config(&ruby, opts)?;
-    let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
+    let bytes_vec: Vec<RString> = bytes_array
+        .into_iter()
+        .map(RString::try_convert)
+        .collect::<Result<_, _>>()?;
     let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
     if bytes_vec.len() != mime_types.len() {
@@ -1955,10 +1971,12 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
         )));
     }
+    // SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
+    // borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
     let contents: Vec<(&[u8], &str)> = bytes_vec
         .iter()
         .zip(mime_types.iter())
-        .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
+        .map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
         .collect();
     let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
@@ -1986,7 +2004,10 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
     let config = parse_extraction_config(&ruby, opts)?;
-    let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
+    let bytes_vec: Vec<RString> = bytes_array
+        .into_iter()
+        .map(RString::try_convert)
+        .collect::<Result<_, _>>()?;
     let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
     if bytes_vec.len() != mime_types.len() {
@@ -1997,10 +2018,12 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
         )));
     }
+    // SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
+    // borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
     let contents: Vec<(&[u8], &str)> = bytes_vec
         .iter()
         .zip(mime_types.iter())
-        .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
+        .map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
         .collect();
     let runtime =

data/kreuzberg.gemspec CHANGED Viewed

@@ -71,7 +71,16 @@ fallback_files = Dir.chdir(__dir__) do
        .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
   end
-  ruby_fallback + core_fallback + ffi_fallback
+  tesseract_fallback = Dir.chdir(repo_root) do
+    Dir.glob('crates/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
+       .reject { |f| File.directory?(f) }
+       .reject { |f| f.include?('/target/') }
+       .grep_v(/\.(swp|bak|tmp)$/)
+       .grep_v(/~$/)
+       .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
+  end
+  ruby_fallback + core_fallback + ffi_fallback + tesseract_fallback
 end
 # Check for vendored crates (copied during CI/packaging)
@@ -98,6 +107,16 @@ vendor_files = Dir.chdir(__dir__) do
                           []
                         end
+  kreuzberg_tesseract_files = if Dir.exist?('vendor/kreuzberg-tesseract')
+                                Dir.glob('vendor/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
+                                   .reject { |f| File.directory?(f) }
+                                   .reject { |f| f.include?('/target/') }
+                                   .grep_v(/\.(swp|bak|tmp)$/)
+                                   .grep_v(/~$/)
+                              else
+                                []
+                              end
   rb_sys_files = if Dir.exist?('vendor/rb-sys')
                    Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
                       .reject { |f| File.directory?(f) }
@@ -114,7 +133,7 @@ vendor_files = Dir.chdir(__dir__) do
                      []
                    end
-  kreuzberg_files + kreuzberg_ffi_files + rb_sys_files + workspace_toml
+  kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
 end
 # Use git-tracked files if available, otherwise fallback to glob
@@ -127,8 +146,21 @@ files = if (ruby_files + core_files + ffi_files).empty?
           ruby_files + core_files + ffi_files
         end
+# Include built native artifacts when present (untracked by git)
+# This enables shipping precompiled gems from CI without committing binaries.
+native_artifacts = Dir.chdir(__dir__) do
+  Dir.glob(%w[
+             lib/**/*.bundle
+             lib/**/*.so
+             lib/**/*.dll
+             lib/**/*.dylib
+           ])
+end
+files.concat(native_artifacts)
 # Filter to only include files that actually exist
 files = files.select { |f| File.exist?(f) }
+files = files.uniq
 Gem::Specification.new do |spec|
   spec.name = 'kreuzberg'

data/lib/kreuzberg/cache_api.rb CHANGED Viewed

@@ -2,12 +2,47 @@
 module Kreuzberg
   # Provides caching capabilities for extraction results.
+  #
+  # This module manages the cache for document extraction results. Results are cached
+  # based on document content, configuration, and MIME type, improving performance for
+  # repeated extractions of the same documents.
   module CacheAPI
+    # Clear all cached extraction results.
+    #
+    # Removes all entries from both the native Rust cache and the local tracking state.
+    # After calling this method, all extraction results will be recomputed on subsequent
+    # requests (unless caching is disabled).
+    #
+    # @return [void] No meaningful return value
+    #
+    # @example Clear cache
+    #   Kreuzberg.clear_cache
+    #   puts "Cache cleared"
     def clear_cache
       native_clear_cache
       reset_cache_tracker!
     end
+    # Retrieve cache statistics.
+    #
+    # Returns information about the current state of the extraction result cache,
+    # including the number of cached entries and total memory used. Statistics include
+    # both native Rust cache metrics and local tracker metrics.
+    #
+    # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
+    #   - :total_entries [Integer] Total number of cached extraction results
+    #   - :total_size_bytes [Integer] Total memory used by cached results in bytes
+    #
+    # @example Get cache statistics
+    #   stats = Kreuzberg.cache_stats
+    #   puts "Cached entries: #{stats[:total_entries]}"
+    #   puts "Cache size: #{stats[:total_size_bytes]} bytes"
+    #
+    # @example Check if cache is full
+    #   stats = Kreuzberg.cache_stats
+    #   if stats[:total_size_bytes] > 1_000_000_000  # 1GB
+    #     Kreuzberg.clear_cache
+    #   end
     def cache_stats
       stats = native_cache_stats
       total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + @__cache_tracker[:entries]

data/lib/kreuzberg/error_context.rb CHANGED Viewed

@@ -4,15 +4,50 @@ require 'json'
 module Kreuzberg
   # ErrorContext module provides access to FFI error introspection functions.
-  # Retrieve the last error code and panic context information from errors.
+  #
+  # This module retrieves detailed error and panic context information from the native
+  # Rust core. It allows inspection of the last error that occurred during extraction,
+  # including panic information with file, line, function, and timestamp details.
   module ErrorContext
     class << self
+      # Get the error code of the last operation.
+      #
+      # Returns the error code from the last FFI call. Returns 0 (SUCCESS) if no error
+      # occurred or if introspection fails.
+      #
+      # @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
+      #
+      # @example Check last error
+      #   code = Kreuzberg::ErrorContext.last_error_code
+      #   case code
+      #   when Kreuzberg::ERROR_CODE_IO
+      #     puts "I/O error occurred"
+      #   when Kreuzberg::ERROR_CODE_PARSING
+      #     puts "Parsing error occurred"
+      #   else
+      #     puts "Success or unknown error"
+      #   end
       def last_error_code
         Kreuzberg._last_error_code_native
       rescue StandardError
         0
       end
+      # Get panic context information from the last error.
+      #
+      # Returns a {Errors::PanicContext} object containing detailed information about
+      # the last panic that occurred in the Rust core. Includes file path, line number,
+      # function name, error message, and timestamp.
+      #
+      # @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
+      #
+      # @example Get panic details
+      #   panic = Kreuzberg::ErrorContext.last_panic_context
+      #   if panic
+      #     puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
+      #     puts "Message: #{panic.message}"
+      #     puts "Time: #{panic.timestamp_secs}"
+      #   end
       def last_panic_context
         json_str = Kreuzberg._last_panic_context_json_native
         return nil unless json_str
@@ -22,6 +57,19 @@ module Kreuzberg
         nil
       end
+      # Get panic context as raw JSON string.
+      #
+      # Returns the panic context information as a JSON string for raw access or
+      # custom parsing. Returns nil if no panic has occurred.
+      #
+      # @return [String, nil] JSON-serialized panic context, or nil if no panic
+      #
+      # @example Get raw JSON panic context
+      #   json = Kreuzberg::ErrorContext.last_panic_context_json
+      #   if json
+      #     panic_data = JSON.parse(json)
+      #     puts panic_data
+      #   end
       def last_panic_context_json
         Kreuzberg._last_panic_context_json_native
       rescue StandardError

data/lib/kreuzberg/extraction_api.rb CHANGED Viewed

@@ -2,7 +2,45 @@
 module Kreuzberg
   # Provides extraction methods for documents and text.
+  #
+  # This module includes both synchronous and asynchronous methods for extracting
+  # content from files and byte data. Results are automatically cached based on
+  # configuration settings.
   module ExtractionAPI
+    # Synchronously extract content from a file.
+    #
+    # Performs document extraction including text, tables, metadata, and optionally
+    # images. Supports various file formats (PDF, DOCX, XLSX, images, HTML, etc.)
+    # based on the detected or specified MIME type.
+    #
+    # @param path [String, Pathname] Path to the document file to extract
+    # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
+    #   If omitted, type is detected from file extension.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration controlling
+    #   OCR settings, chunking, image extraction, and more. Accepts either a {Config::Extraction}
+    #   object or a configuration hash.
+    #
+    # @return [Result] Extraction result containing content, metadata, tables, and images
+    #
+    # @raise [Errors::IOError] If the file cannot be read or access is denied
+    # @raise [Errors::ParsingError] If document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If the file format is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Extract a PDF file
+    #   result = Kreuzberg.extract_file_sync("document.pdf")
+    #   puts result.content
+    #
+    # @example Extract with explicit MIME type
+    #   result = Kreuzberg.extract_file_sync("data.bin", mime_type: "application/pdf")
+    #
+    # @example Extract with OCR enabled
+    #   config = Kreuzberg::Config::Extraction.new(
+    #     force_ocr: true,
+    #     ocr: Kreuzberg::Config::OCR.new(language: "eng")
+    #   )
+    #   result = Kreuzberg.extract_file_sync("scanned.pdf", config: config)
     def extract_file_sync(path, mime_type: nil, config: nil)
       opts = normalize_config(config)
       hash = if mime_type
@@ -15,6 +53,32 @@ module Kreuzberg
       result
     end
+    # Synchronously extract content from byte data.
+    #
+    # Performs document extraction directly from binary data in memory. Useful for
+    # extracting content from files already loaded into memory or from network streams.
+    #
+    # @param data [String] Binary document data (can contain any byte values)
+    # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
+    #   This parameter is mandatory to guide the extraction engine.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
+    #   either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Result] Extraction result containing content, metadata, tables, and images
+    #
+    # @raise [Errors::ParsingError] If document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Extract PDF from memory
+    #   pdf_data = File.read("document.pdf", binmode: true)
+    #   result = Kreuzberg.extract_bytes_sync(pdf_data, "application/pdf")
+    #   puts result.content
+    #
+    # @example Extract from a network stream
+    #   response = HTTParty.get("https://example.com/document.docx")
+    #   result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
     def extract_bytes_sync(data, mime_type, config: nil)
       opts = normalize_config(config)
       hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
@@ -23,6 +87,37 @@ module Kreuzberg
       result
     end
+    # Synchronously extract content from multiple files.
+    #
+    # Processes multiple files in a single batch operation. Files are extracted sequentially,
+    # and results maintain the same order as the input paths. This is useful for bulk
+    # processing multiple documents with consistent configuration.
+    #
+    # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
+    #   is converted to a string and MIME type is auto-detected from extension.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
+    #   Accepts either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Array<Result>] Array of extraction results in the same order as input paths.
+    #   Array length matches the input paths length.
+    #
+    # @raise [Errors::IOError] If any file cannot be read
+    # @raise [Errors::ParsingError] If any document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If any file format is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Batch extract multiple PDFs
+    #   paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
+    #   results = Kreuzberg.batch_extract_files_sync(paths)
+    #   results.each_with_index do |result, idx|
+    #     puts "File #{idx}: #{result.content.length} characters"
+    #   end
+    #
+    # @example Batch extract with consistent configuration
+    #   paths = Dir.glob("documents/*.pdf")
+    #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
+    #   results = Kreuzberg.batch_extract_files_sync(paths, config: config)
     def batch_extract_files_sync(paths, config: nil)
       opts = normalize_config(config)
       hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
@@ -31,6 +126,36 @@ module Kreuzberg
       results
     end
+    # Asynchronously extract content from a file.
+    #
+    # Non-blocking extraction that returns a {Result} promise. Extraction is performed
+    # in the background using native threads or the Tokio runtime. This method is
+    # preferred for I/O-bound operations and integrating with async workflows.
+    #
+    # @param path [String, Pathname] Path to the document file to extract
+    # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
+    #   If omitted, type is detected from file extension.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
+    #   either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Result] Extraction result containing content, metadata, tables, and images.
+    #   In async contexts, this result is available upon method return.
+    #
+    # @raise [Errors::IOError] If the file cannot be read or access is denied
+    # @raise [Errors::ParsingError] If document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If the file format is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Extract a PDF file asynchronously
+    #   result = Kreuzberg.extract_file("large_document.pdf")
+    #   puts result.content
+    #
+    # @example Extract with custom OCR configuration
+    #   config = Kreuzberg::Config::Extraction.new(
+    #     ocr: Kreuzberg::Config::OCR.new(language: "deu")
+    #   )
+    #   result = Kreuzberg.extract_file("document.pdf", config: config)
     def extract_file(path, mime_type: nil, config: nil)
       opts = normalize_config(config)
       hash = if mime_type
@@ -43,6 +168,36 @@ module Kreuzberg
       result
     end
+    # Asynchronously extract content from byte data.
+    #
+    # Non-blocking extraction from in-memory binary data. Like {#extract_file},
+    # this performs extraction in the background, making it suitable for handling
+    # high-volume extraction workloads without blocking the main thread.
+    #
+    # @param data [String] Binary document data (can contain any byte values)
+    # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
+    #   This parameter is mandatory to guide the extraction engine.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
+    #   either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Result] Extraction result containing content, metadata, tables, and images
+    #
+    # @raise [Errors::ParsingError] If document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Extract PDF from memory asynchronously
+    #   pdf_data = File.read("document.pdf", binmode: true)
+    #   result = Kreuzberg.extract_bytes(pdf_data, "application/pdf")
+    #   puts result.content
+    #
+    # @example Extract with image extraction
+    #   data = File.read("file.docx", binmode: true)
+    #   config = Kreuzberg::Config::Extraction.new(
+    #     image_extraction: Kreuzberg::Config::ImageExtraction.new(extract_images: true)
+    #   )
+    #   result = Kreuzberg.extract_bytes(data, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", config: config)
     def extract_bytes(data, mime_type, config: nil)
       opts = normalize_config(config)
       hash = native_extract_bytes(data.to_s, mime_type.to_s, **opts)
@@ -51,6 +206,39 @@ module Kreuzberg
       result
     end
+    # Asynchronously extract content from multiple files.
+    #
+    # Non-blocking batch extraction from multiple files. Results maintain the same order
+    # as input paths. This is the preferred method for bulk processing when non-blocking
+    # I/O is required (e.g., in web servers or async applications).
+    #
+    # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
+    #   is converted to a string and MIME type is auto-detected from extension.
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
+    #   Accepts either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Array<Result>] Array of extraction results in the same order as input paths.
+    #   Array length matches the input paths length.
+    #
+    # @raise [Errors::IOError] If any file cannot be read
+    # @raise [Errors::ParsingError] If any document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If any file format is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Batch extract multiple files asynchronously
+    #   paths = ["invoice_1.pdf", "invoice_2.pdf", "invoice_3.pdf"]
+    #   results = Kreuzberg.batch_extract_files(paths)
+    #   results.each_with_index do |result, idx|
+    #     puts "Invoice #{idx}: #{result.detected_languages}"
+    #   end
+    #
+    # @example Batch extract with chunking
+    #   paths = Dir.glob("reports/*.docx")
+    #   config = Kreuzberg::Config::Extraction.new(
+    #     chunking: Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
+    #   )
+    #   results = Kreuzberg.batch_extract_files(paths, config: config)
     def batch_extract_files(paths, config: nil)
       opts = normalize_config(config)
       hashes = native_batch_extract_files(paths.map(&:to_s), **opts)
@@ -59,6 +247,37 @@ module Kreuzberg
       results
     end
+    # Synchronously extract content from multiple byte data sources.
+    #
+    # Processes multiple in-memory binary documents in a single batch operation. Results
+    # maintain the same order as the input data array. The mime_types array must have
+    # the same length as the data_array.
+    #
+    # @param data_array [Array<String>] Array of binary document data. Each element can
+    #   contain any byte values (e.g., PDF binary data).
+    # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
+    #   Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
+    #   Accepts either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Array<Result>] Array of extraction results in the same order as input data.
+    #   Array length matches the data_array length.
+    #
+    # @raise [ArgumentError] If data_array and mime_types have different lengths
+    # @raise [Errors::ParsingError] If any document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Batch extract binary documents
+    #   pdf_data_1 = File.read("doc1.pdf", binmode: true)
+    #   pdf_data_2 = File.read("doc2.pdf", binmode: true)
+    #   docx_data = File.read("report.docx", binmode: true)
+    #
+    #   data = [pdf_data_1, pdf_data_2, docx_data]
+    #   types = ["application/pdf", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+    #   results = Kreuzberg.batch_extract_bytes_sync(data, types)
+    #   results.each { |r| puts r.content }
     def batch_extract_bytes_sync(data_array, mime_types, config: nil)
       opts = normalize_config(config)
       hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
@@ -67,6 +286,42 @@ module Kreuzberg
       results
     end
+    # Asynchronously extract content from multiple byte data sources.
+    #
+    # Non-blocking batch extraction from multiple in-memory binary documents. Results
+    # maintain the same order as the input data array. This method is preferred when
+    # processing multiple documents without blocking (e.g., handling multiple uploads).
+    #
+    # @param data_array [Array<String>] Array of binary document data. Each element can
+    #   contain any byte values (e.g., PDF binary data).
+    # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
+    #   Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
+    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
+    #   Accepts either a {Config::Extraction} object or a configuration hash.
+    #
+    # @return [Array<Result>] Array of extraction results in the same order as input data.
+    #   Array length matches the data_array length.
+    #
+    # @raise [ArgumentError] If data_array and mime_types have different lengths
+    # @raise [Errors::ParsingError] If any document parsing fails
+    # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
+    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
+    # @raise [Errors::MissingDependencyError] If a required dependency is missing
+    #
+    # @example Batch extract uploaded documents asynchronously
+    #   # From a web request with multiple file uploads
+    #   uploaded_files = params[:files]  # Array of uploaded file objects
+    #   data = uploaded_files.map(&:read)
+    #   types = uploaded_files.map(&:content_type)
+    #
+    #   results = Kreuzberg.batch_extract_bytes(data, types)
+    #   results.each { |r| puts r.content }
+    #
+    # @example Batch extract with OCR
+    #   data = [scan_1_bytes, scan_2_bytes, scan_3_bytes]
+    #   types = ["image/png", "image/png", "image/png"]
+    #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
+    #   results = Kreuzberg.batch_extract_bytes(data, types, config: config)
     def batch_extract_bytes(data_array, mime_types, config: nil)
       opts = normalize_config(config)
       hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.0.0-rc.8'
+  VERSION = '4.0.0-rc.11'
 end

data/lib/kreuzberg.rb CHANGED Viewed

@@ -23,6 +23,12 @@ module Kreuzberg
   # Alias for API consistency with other language bindings
   ExtractionConfig = Config::Extraction
+  PageConfig = Config::PageConfig
+  module KeywordAlgorithm
+    YAKE = :yake
+    RAKE = :rake
+  end
   @__cache_tracker = { entries: 0, bytes: 0 }

data/lib/libpdfium.dylib ADDED Viewed

Binary file