kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -9
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
  5. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  6. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  7. data/kreuzberg.gemspec +34 -2
  8. data/lib/kreuzberg/cache_api.rb +35 -0
  9. data/lib/kreuzberg/error_context.rb +49 -1
  10. data/lib/kreuzberg/extraction_api.rb +255 -0
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +6 -0
  13. data/lib/libpdfium.dylib +0 -0
  14. data/sig/kreuzberg.rbs +9 -0
  15. data/vendor/Cargo.toml +44 -0
  16. data/vendor/kreuzberg/Cargo.toml +61 -38
  17. data/vendor/kreuzberg/README.md +36 -27
  18. data/vendor/kreuzberg/build.rs +197 -245
  19. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  20. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  21. data/vendor/kreuzberg/src/error.rs +1 -1
  22. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  23. data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
  24. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  25. data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
  26. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  27. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  28. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  29. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  30. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  31. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  32. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  33. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  34. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  35. data/vendor/kreuzberg-ffi/README.md +851 -0
  36. data/vendor/kreuzberg-ffi/build.rs +176 -0
  37. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  38. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  39. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  40. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  41. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  42. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  43. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  44. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  45. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  46. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  47. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  48. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  49. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  50. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  51. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  52. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  53. data/vendor/kreuzberg-tesseract/README.md +399 -0
  54. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  55. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  56. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  57. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  58. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  59. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  60. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  61. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  62. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  63. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  64. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  65. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  66. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  67. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  68. metadata +39 -3
  69. data/vendor/rb-sys/bin/release.sh +0 -21
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.8"
10
+ version = "4.0.0-rc.11"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -29,7 +29,7 @@ default = []
29
29
 
30
30
  [dependencies]
31
31
  async-trait = "0.1.89"
32
- kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full", "embeddings"] }
32
+ kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"] }
33
33
  kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
34
34
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
35
35
  "rb-sys",
@@ -48,7 +48,7 @@ tokio = { version = "1.48.0", features = [
48
48
  "time",
49
49
  "io-util",
50
50
  ] }
51
- html-to-markdown-rs = { version = "2.14.1", default-features = false }
51
+ html-to-markdown-rs = { version = "2.14.2", default-features = false }
52
52
 
53
53
  [dev-dependencies]
54
54
  pretty_assertions = "1.4"
@@ -23,7 +23,9 @@ use kreuzberg::{
23
23
  use magnus::exception::ExceptionClass;
24
24
  use magnus::r_hash::ForEach;
25
25
  use magnus::value::ReprValue;
26
- use magnus::{Error, IntoValue, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
26
+ use magnus::{
27
+ Error, IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args,
28
+ };
27
29
  use std::fs;
28
30
  use std::path::{Path, PathBuf};
29
31
 
@@ -52,10 +54,15 @@ impl Drop for GcGuardedValue {
52
54
  }
53
55
  }
54
56
 
55
- unsafe extern "C" {
56
- fn kreuzberg_last_error_code() -> i32;
57
- fn kreuzberg_last_panic_context() -> *const std::ffi::c_char;
58
- fn kreuzberg_free_string(s: *mut std::ffi::c_char);
57
+ use std::ffi::c_char;
58
+
59
+ // These C ABI functions are provided by the kreuzberg-ffi crate
60
+ // We declare them here to ensure proper linking on all platforms
61
+ #[link(name = "kreuzberg_ffi", kind = "static")]
62
+ extern "C" {
63
+ pub fn kreuzberg_last_error_code() -> i32;
64
+ pub fn kreuzberg_last_panic_context() -> *mut c_char;
65
+ pub fn kreuzberg_free_string(s: *mut c_char);
59
66
  }
60
67
 
61
68
  /// Retrieve panic context from FFI if available
@@ -1797,13 +1804,16 @@ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
1797
1804
  ///
1798
1805
  fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
1799
1806
  let ruby = Ruby::get().expect("Ruby not initialized");
1800
- let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1807
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
1801
1808
  let (data, mime_type) = args.required;
1802
1809
  let opts = Some(args.keywords);
1803
1810
 
1804
1811
  let config = parse_extraction_config(&ruby, opts)?;
1805
1812
 
1806
- let result = kreuzberg::extract_bytes_sync(data.as_bytes(), &mime_type, &config).map_err(kreuzberg_error)?;
1813
+ // SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
1814
+ // borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
1815
+ let bytes = unsafe { data.as_slice() };
1816
+ let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
1807
1817
 
1808
1818
  extraction_result_to_ruby(&ruby, result)
1809
1819
  }
@@ -1877,7 +1887,7 @@ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
1877
1887
  ///
1878
1888
  fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
1879
1889
  let ruby = Ruby::get().expect("Ruby not initialized");
1880
- let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1890
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
1881
1891
  let (data, mime_type) = args.required;
1882
1892
  let opts = Some(args.keywords);
1883
1893
 
@@ -1886,8 +1896,11 @@ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
1886
1896
  let runtime =
1887
1897
  tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1888
1898
 
1899
+ // SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
1900
+ // borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
1901
+ let bytes = unsafe { data.as_slice() };
1889
1902
  let result = runtime
1890
- .block_on(async { kreuzberg::extract_bytes(data.as_bytes(), &mime_type, &config).await })
1903
+ .block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
1891
1904
  .map_err(kreuzberg_error)?;
1892
1905
 
1893
1906
  extraction_result_to_ruby(&ruby, result)
@@ -1944,7 +1957,10 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
1944
1957
 
1945
1958
  let config = parse_extraction_config(&ruby, opts)?;
1946
1959
 
1947
- let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
1960
+ let bytes_vec: Vec<RString> = bytes_array
1961
+ .into_iter()
1962
+ .map(RString::try_convert)
1963
+ .collect::<Result<_, _>>()?;
1948
1964
  let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
1949
1965
 
1950
1966
  if bytes_vec.len() != mime_types.len() {
@@ -1955,10 +1971,12 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
1955
1971
  )));
1956
1972
  }
1957
1973
 
1974
+ // SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
1975
+ // borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
1958
1976
  let contents: Vec<(&[u8], &str)> = bytes_vec
1959
1977
  .iter()
1960
1978
  .zip(mime_types.iter())
1961
- .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
1979
+ .map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
1962
1980
  .collect();
1963
1981
 
1964
1982
  let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
@@ -1986,7 +2004,10 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
1986
2004
 
1987
2005
  let config = parse_extraction_config(&ruby, opts)?;
1988
2006
 
1989
- let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
2007
+ let bytes_vec: Vec<RString> = bytes_array
2008
+ .into_iter()
2009
+ .map(RString::try_convert)
2010
+ .collect::<Result<_, _>>()?;
1990
2011
  let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
1991
2012
 
1992
2013
  if bytes_vec.len() != mime_types.len() {
@@ -1997,10 +2018,12 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
1997
2018
  )));
1998
2019
  }
1999
2020
 
2021
+ // SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
2022
+ // borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
2000
2023
  let contents: Vec<(&[u8], &str)> = bytes_vec
2001
2024
  .iter()
2002
2025
  .zip(mime_types.iter())
2003
- .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
2026
+ .map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
2004
2027
  .collect();
2005
2028
 
2006
2029
  let runtime =
data/kreuzberg.gemspec CHANGED
@@ -71,7 +71,16 @@ fallback_files = Dir.chdir(__dir__) do
71
71
  .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
72
72
  end
73
73
 
74
- ruby_fallback + core_fallback + ffi_fallback
74
+ tesseract_fallback = Dir.chdir(repo_root) do
75
+ Dir.glob('crates/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
76
+ .reject { |f| File.directory?(f) }
77
+ .reject { |f| f.include?('/target/') }
78
+ .grep_v(/\.(swp|bak|tmp)$/)
79
+ .grep_v(/~$/)
80
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
81
+ end
82
+
83
+ ruby_fallback + core_fallback + ffi_fallback + tesseract_fallback
75
84
  end
76
85
 
77
86
  # Check for vendored crates (copied during CI/packaging)
@@ -98,6 +107,16 @@ vendor_files = Dir.chdir(__dir__) do
98
107
  []
99
108
  end
100
109
 
110
+ kreuzberg_tesseract_files = if Dir.exist?('vendor/kreuzberg-tesseract')
111
+ Dir.glob('vendor/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
112
+ .reject { |f| File.directory?(f) }
113
+ .reject { |f| f.include?('/target/') }
114
+ .grep_v(/\.(swp|bak|tmp)$/)
115
+ .grep_v(/~$/)
116
+ else
117
+ []
118
+ end
119
+
101
120
  rb_sys_files = if Dir.exist?('vendor/rb-sys')
102
121
  Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
103
122
  .reject { |f| File.directory?(f) }
@@ -114,7 +133,7 @@ vendor_files = Dir.chdir(__dir__) do
114
133
  []
115
134
  end
116
135
 
117
- kreuzberg_files + kreuzberg_ffi_files + rb_sys_files + workspace_toml
136
+ kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
118
137
  end
119
138
 
120
139
  # Use git-tracked files if available, otherwise fallback to glob
@@ -127,8 +146,21 @@ files = if (ruby_files + core_files + ffi_files).empty?
127
146
  ruby_files + core_files + ffi_files
128
147
  end
129
148
 
149
+ # Include built native artifacts when present (untracked by git)
150
+ # This enables shipping precompiled gems from CI without committing binaries.
151
+ native_artifacts = Dir.chdir(__dir__) do
152
+ Dir.glob(%w[
153
+ lib/**/*.bundle
154
+ lib/**/*.so
155
+ lib/**/*.dll
156
+ lib/**/*.dylib
157
+ ])
158
+ end
159
+ files.concat(native_artifacts)
160
+
130
161
  # Filter to only include files that actually exist
131
162
  files = files.select { |f| File.exist?(f) }
163
+ files = files.uniq
132
164
 
133
165
  Gem::Specification.new do |spec|
134
166
  spec.name = 'kreuzberg'
@@ -2,12 +2,47 @@
2
2
 
3
3
  module Kreuzberg
4
4
  # Provides caching capabilities for extraction results.
5
+ #
6
+ # This module manages the cache for document extraction results. Results are cached
7
+ # based on document content, configuration, and MIME type, improving performance for
8
+ # repeated extractions of the same documents.
5
9
  module CacheAPI
10
+ # Clear all cached extraction results.
11
+ #
12
+ # Removes all entries from both the native Rust cache and the local tracking state.
13
+ # After calling this method, all extraction results will be recomputed on subsequent
14
+ # requests (unless caching is disabled).
15
+ #
16
+ # @return [void] No meaningful return value
17
+ #
18
+ # @example Clear cache
19
+ # Kreuzberg.clear_cache
20
+ # puts "Cache cleared"
6
21
  def clear_cache
7
22
  native_clear_cache
8
23
  reset_cache_tracker!
9
24
  end
10
25
 
26
+ # Retrieve cache statistics.
27
+ #
28
+ # Returns information about the current state of the extraction result cache,
29
+ # including the number of cached entries and total memory used. Statistics include
30
+ # both native Rust cache metrics and local tracker metrics.
31
+ #
32
+ # @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
33
+ # - :total_entries [Integer] Total number of cached extraction results
34
+ # - :total_size_bytes [Integer] Total memory used by cached results in bytes
35
+ #
36
+ # @example Get cache statistics
37
+ # stats = Kreuzberg.cache_stats
38
+ # puts "Cached entries: #{stats[:total_entries]}"
39
+ # puts "Cache size: #{stats[:total_size_bytes]} bytes"
40
+ #
41
+ # @example Check if cache is full
42
+ # stats = Kreuzberg.cache_stats
43
+ # if stats[:total_size_bytes] > 1_000_000_000 # 1GB
44
+ # Kreuzberg.clear_cache
45
+ # end
11
46
  def cache_stats
12
47
  stats = native_cache_stats
13
48
  total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + @__cache_tracker[:entries]
@@ -4,15 +4,50 @@ require 'json'
4
4
 
5
5
  module Kreuzberg
6
6
  # ErrorContext module provides access to FFI error introspection functions.
7
- # Retrieve the last error code and panic context information from errors.
7
+ #
8
+ # This module retrieves detailed error and panic context information from the native
9
+ # Rust core. It allows inspection of the last error that occurred during extraction,
10
+ # including panic information with file, line, function, and timestamp details.
8
11
  module ErrorContext
9
12
  class << self
13
+ # Get the error code of the last operation.
14
+ #
15
+ # Returns the error code from the last FFI call. Returns 0 (SUCCESS) if no error
16
+ # occurred or if introspection fails.
17
+ #
18
+ # @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
19
+ #
20
+ # @example Check last error
21
+ # code = Kreuzberg::ErrorContext.last_error_code
22
+ # case code
23
+ # when Kreuzberg::ERROR_CODE_IO
24
+ # puts "I/O error occurred"
25
+ # when Kreuzberg::ERROR_CODE_PARSING
26
+ # puts "Parsing error occurred"
27
+ # else
28
+ # puts "Success or unknown error"
29
+ # end
10
30
  def last_error_code
11
31
  Kreuzberg._last_error_code_native
12
32
  rescue StandardError
13
33
  0
14
34
  end
15
35
 
36
+ # Get panic context information from the last error.
37
+ #
38
+ # Returns a {Errors::PanicContext} object containing detailed information about
39
+ # the last panic that occurred in the Rust core. Includes file path, line number,
40
+ # function name, error message, and timestamp.
41
+ #
42
+ # @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
43
+ #
44
+ # @example Get panic details
45
+ # panic = Kreuzberg::ErrorContext.last_panic_context
46
+ # if panic
47
+ # puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
48
+ # puts "Message: #{panic.message}"
49
+ # puts "Time: #{panic.timestamp_secs}"
50
+ # end
16
51
  def last_panic_context
17
52
  json_str = Kreuzberg._last_panic_context_json_native
18
53
  return nil unless json_str
@@ -22,6 +57,19 @@ module Kreuzberg
22
57
  nil
23
58
  end
24
59
 
60
+ # Get panic context as raw JSON string.
61
+ #
62
+ # Returns the panic context information as a JSON string for raw access or
63
+ # custom parsing. Returns nil if no panic has occurred.
64
+ #
65
+ # @return [String, nil] JSON-serialized panic context, or nil if no panic
66
+ #
67
+ # @example Get raw JSON panic context
68
+ # json = Kreuzberg::ErrorContext.last_panic_context_json
69
+ # if json
70
+ # panic_data = JSON.parse(json)
71
+ # puts panic_data
72
+ # end
25
73
  def last_panic_context_json
26
74
  Kreuzberg._last_panic_context_json_native
27
75
  rescue StandardError
@@ -2,7 +2,45 @@
2
2
 
3
3
  module Kreuzberg
4
4
  # Provides extraction methods for documents and text.
5
+ #
6
+ # This module includes both synchronous and asynchronous methods for extracting
7
+ # content from files and byte data. Results are automatically cached based on
8
+ # configuration settings.
5
9
  module ExtractionAPI
10
+ # Synchronously extract content from a file.
11
+ #
12
+ # Performs document extraction including text, tables, metadata, and optionally
13
+ # images. Supports various file formats (PDF, DOCX, XLSX, images, HTML, etc.)
14
+ # based on the detected or specified MIME type.
15
+ #
16
+ # @param path [String, Pathname] Path to the document file to extract
17
+ # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
18
+ # If omitted, type is detected from file extension.
19
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration controlling
20
+ # OCR settings, chunking, image extraction, and more. Accepts either a {Config::Extraction}
21
+ # object or a configuration hash.
22
+ #
23
+ # @return [Result] Extraction result containing content, metadata, tables, and images
24
+ #
25
+ # @raise [Errors::IOError] If the file cannot be read or access is denied
26
+ # @raise [Errors::ParsingError] If document parsing fails
27
+ # @raise [Errors::UnsupportedFormatError] If the file format is not supported
28
+ # @raise [Errors::OCRError] If OCR is enabled and fails
29
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
30
+ #
31
+ # @example Extract a PDF file
32
+ # result = Kreuzberg.extract_file_sync("document.pdf")
33
+ # puts result.content
34
+ #
35
+ # @example Extract with explicit MIME type
36
+ # result = Kreuzberg.extract_file_sync("data.bin", mime_type: "application/pdf")
37
+ #
38
+ # @example Extract with OCR enabled
39
+ # config = Kreuzberg::Config::Extraction.new(
40
+ # force_ocr: true,
41
+ # ocr: Kreuzberg::Config::OCR.new(language: "eng")
42
+ # )
43
+ # result = Kreuzberg.extract_file_sync("scanned.pdf", config: config)
6
44
  def extract_file_sync(path, mime_type: nil, config: nil)
7
45
  opts = normalize_config(config)
8
46
  hash = if mime_type
@@ -15,6 +53,32 @@ module Kreuzberg
15
53
  result
16
54
  end
17
55
 
56
+ # Synchronously extract content from byte data.
57
+ #
58
+ # Performs document extraction directly from binary data in memory. Useful for
59
+ # extracting content from files already loaded into memory or from network streams.
60
+ #
61
+ # @param data [String] Binary document data (can contain any byte values)
62
+ # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
63
+ # This parameter is mandatory to guide the extraction engine.
64
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
65
+ # either a {Config::Extraction} object or a configuration hash.
66
+ #
67
+ # @return [Result] Extraction result containing content, metadata, tables, and images
68
+ #
69
+ # @raise [Errors::ParsingError] If document parsing fails
70
+ # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
71
+ # @raise [Errors::OCRError] If OCR is enabled and fails
72
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
73
+ #
74
+ # @example Extract PDF from memory
75
+ # pdf_data = File.read("document.pdf", binmode: true)
76
+ # result = Kreuzberg.extract_bytes_sync(pdf_data, "application/pdf")
77
+ # puts result.content
78
+ #
79
+ # @example Extract from a network stream
80
+ # response = HTTParty.get("https://example.com/document.docx")
81
+ # result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
18
82
  def extract_bytes_sync(data, mime_type, config: nil)
19
83
  opts = normalize_config(config)
20
84
  hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
@@ -23,6 +87,37 @@ module Kreuzberg
23
87
  result
24
88
  end
25
89
 
90
+ # Synchronously extract content from multiple files.
91
+ #
92
+ # Processes multiple files in a single batch operation. Files are extracted sequentially,
93
+ # and results maintain the same order as the input paths. This is useful for bulk
94
+ # processing multiple documents with consistent configuration.
95
+ #
96
+ # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
97
+ # is converted to a string and MIME type is auto-detected from extension.
98
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
99
+ # Accepts either a {Config::Extraction} object or a configuration hash.
100
+ #
101
+ # @return [Array<Result>] Array of extraction results in the same order as input paths.
102
+ # Array length matches the input paths length.
103
+ #
104
+ # @raise [Errors::IOError] If any file cannot be read
105
+ # @raise [Errors::ParsingError] If any document parsing fails
106
+ # @raise [Errors::UnsupportedFormatError] If any file format is not supported
107
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
108
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
109
+ #
110
+ # @example Batch extract multiple PDFs
111
+ # paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
112
+ # results = Kreuzberg.batch_extract_files_sync(paths)
113
+ # results.each_with_index do |result, idx|
114
+ # puts "File #{idx}: #{result.content.length} characters"
115
+ # end
116
+ #
117
+ # @example Batch extract with consistent configuration
118
+ # paths = Dir.glob("documents/*.pdf")
119
+ # config = Kreuzberg::Config::Extraction.new(force_ocr: true)
120
+ # results = Kreuzberg.batch_extract_files_sync(paths, config: config)
26
121
  def batch_extract_files_sync(paths, config: nil)
27
122
  opts = normalize_config(config)
28
123
  hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
@@ -31,6 +126,36 @@ module Kreuzberg
31
126
  results
32
127
  end
33
128
 
129
+ # Asynchronously extract content from a file.
130
+ #
131
+ # Non-blocking extraction that returns a {Result} promise. Extraction is performed
132
+ # in the background using native threads or the Tokio runtime. This method is
133
+ # preferred for I/O-bound operations and integrating with async workflows.
134
+ #
135
+ # @param path [String, Pathname] Path to the document file to extract
136
+ # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
137
+ # If omitted, type is detected from file extension.
138
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
139
+ # either a {Config::Extraction} object or a configuration hash.
140
+ #
141
+ # @return [Result] Extraction result containing content, metadata, tables, and images.
142
+ # In async contexts, this result is available upon method return.
143
+ #
144
+ # @raise [Errors::IOError] If the file cannot be read or access is denied
145
+ # @raise [Errors::ParsingError] If document parsing fails
146
+ # @raise [Errors::UnsupportedFormatError] If the file format is not supported
147
+ # @raise [Errors::OCRError] If OCR is enabled and fails
148
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
149
+ #
150
+ # @example Extract a PDF file asynchronously
151
+ # result = Kreuzberg.extract_file("large_document.pdf")
152
+ # puts result.content
153
+ #
154
+ # @example Extract with custom OCR configuration
155
+ # config = Kreuzberg::Config::Extraction.new(
156
+ # ocr: Kreuzberg::Config::OCR.new(language: "deu")
157
+ # )
158
+ # result = Kreuzberg.extract_file("document.pdf", config: config)
34
159
  def extract_file(path, mime_type: nil, config: nil)
35
160
  opts = normalize_config(config)
36
161
  hash = if mime_type
@@ -43,6 +168,36 @@ module Kreuzberg
43
168
  result
44
169
  end
45
170
 
171
+ # Asynchronously extract content from byte data.
172
+ #
173
+ # Non-blocking extraction from in-memory binary data. Like {#extract_file},
174
+ # this performs extraction in the background, making it suitable for handling
175
+ # high-volume extraction workloads without blocking the main thread.
176
+ #
177
+ # @param data [String] Binary document data (can contain any byte values)
178
+ # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
179
+ # This parameter is mandatory to guide the extraction engine.
180
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
181
+ # either a {Config::Extraction} object or a configuration hash.
182
+ #
183
+ # @return [Result] Extraction result containing content, metadata, tables, and images
184
+ #
185
+ # @raise [Errors::ParsingError] If document parsing fails
186
+ # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
187
+ # @raise [Errors::OCRError] If OCR is enabled and fails
188
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
189
+ #
190
+ # @example Extract PDF from memory asynchronously
191
+ # pdf_data = File.read("document.pdf", binmode: true)
192
+ # result = Kreuzberg.extract_bytes(pdf_data, "application/pdf")
193
+ # puts result.content
194
+ #
195
+ # @example Extract with image extraction
196
+ # data = File.read("file.docx", binmode: true)
197
+ # config = Kreuzberg::Config::Extraction.new(
198
+ # image_extraction: Kreuzberg::Config::ImageExtraction.new(extract_images: true)
199
+ # )
200
+ # result = Kreuzberg.extract_bytes(data, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", config: config)
46
201
  def extract_bytes(data, mime_type, config: nil)
47
202
  opts = normalize_config(config)
48
203
  hash = native_extract_bytes(data.to_s, mime_type.to_s, **opts)
@@ -51,6 +206,39 @@ module Kreuzberg
51
206
  result
52
207
  end
53
208
 
209
+ # Asynchronously extract content from multiple files.
210
+ #
211
+ # Non-blocking batch extraction from multiple files. Results maintain the same order
212
+ # as input paths. This is the preferred method for bulk processing when non-blocking
213
+ # I/O is required (e.g., in web servers or async applications).
214
+ #
215
+ # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
216
+ # is converted to a string and MIME type is auto-detected from extension.
217
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
218
+ # Accepts either a {Config::Extraction} object or a configuration hash.
219
+ #
220
+ # @return [Array<Result>] Array of extraction results in the same order as input paths.
221
+ # Array length matches the input paths length.
222
+ #
223
+ # @raise [Errors::IOError] If any file cannot be read
224
+ # @raise [Errors::ParsingError] If any document parsing fails
225
+ # @raise [Errors::UnsupportedFormatError] If any file format is not supported
226
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
227
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
228
+ #
229
+ # @example Batch extract multiple files asynchronously
230
+ # paths = ["invoice_1.pdf", "invoice_2.pdf", "invoice_3.pdf"]
231
+ # results = Kreuzberg.batch_extract_files(paths)
232
+ # results.each_with_index do |result, idx|
233
+ # puts "Invoice #{idx}: #{result.detected_languages}"
234
+ # end
235
+ #
236
+ # @example Batch extract with chunking
237
+ # paths = Dir.glob("reports/*.docx")
238
+ # config = Kreuzberg::Config::Extraction.new(
239
+ # chunking: Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
240
+ # )
241
+ # results = Kreuzberg.batch_extract_files(paths, config: config)
54
242
  def batch_extract_files(paths, config: nil)
55
243
  opts = normalize_config(config)
56
244
  hashes = native_batch_extract_files(paths.map(&:to_s), **opts)
@@ -59,6 +247,37 @@ module Kreuzberg
59
247
  results
60
248
  end
61
249
 
250
+ # Synchronously extract content from multiple byte data sources.
251
+ #
252
+ # Processes multiple in-memory binary documents in a single batch operation. Results
253
+ # maintain the same order as the input data array. The mime_types array must have
254
+ # the same length as the data_array.
255
+ #
256
+ # @param data_array [Array<String>] Array of binary document data. Each element can
257
+ # contain any byte values (e.g., PDF binary data).
258
+ # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
259
+ # Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
260
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
261
+ # Accepts either a {Config::Extraction} object or a configuration hash.
262
+ #
263
+ # @return [Array<Result>] Array of extraction results in the same order as input data.
264
+ # Array length matches the data_array length.
265
+ #
266
+ # @raise [ArgumentError] If data_array and mime_types have different lengths
267
+ # @raise [Errors::ParsingError] If any document parsing fails
268
+ # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
269
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
270
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
271
+ #
272
+ # @example Batch extract binary documents
273
+ # pdf_data_1 = File.read("doc1.pdf", binmode: true)
274
+ # pdf_data_2 = File.read("doc2.pdf", binmode: true)
275
+ # docx_data = File.read("report.docx", binmode: true)
276
+ #
277
+ # data = [pdf_data_1, pdf_data_2, docx_data]
278
+ # types = ["application/pdf", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
279
+ # results = Kreuzberg.batch_extract_bytes_sync(data, types)
280
+ # results.each { |r| puts r.content }
62
281
  def batch_extract_bytes_sync(data_array, mime_types, config: nil)
63
282
  opts = normalize_config(config)
64
283
  hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
@@ -67,6 +286,42 @@ module Kreuzberg
67
286
  results
68
287
  end
69
288
 
289
+ # Asynchronously extract content from multiple byte data sources.
290
+ #
291
+ # Non-blocking batch extraction from multiple in-memory binary documents. Results
292
+ # maintain the same order as the input data array. This method is preferred when
293
+ # processing multiple documents without blocking (e.g., handling multiple uploads).
294
+ #
295
+ # @param data_array [Array<String>] Array of binary document data. Each element can
296
+ # contain any byte values (e.g., PDF binary data).
297
+ # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
298
+ # Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
299
+ # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
300
+ # Accepts either a {Config::Extraction} object or a configuration hash.
301
+ #
302
+ # @return [Array<Result>] Array of extraction results in the same order as input data.
303
+ # Array length matches the data_array length.
304
+ #
305
+ # @raise [ArgumentError] If data_array and mime_types have different lengths
306
+ # @raise [Errors::ParsingError] If any document parsing fails
307
+ # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
308
+ # @raise [Errors::OCRError] If OCR is enabled and fails on any document
309
+ # @raise [Errors::MissingDependencyError] If a required dependency is missing
310
+ #
311
+ # @example Batch extract uploaded documents asynchronously
312
+ # # From a web request with multiple file uploads
313
+ # uploaded_files = params[:files] # Array of uploaded file objects
314
+ # data = uploaded_files.map(&:read)
315
+ # types = uploaded_files.map(&:content_type)
316
+ #
317
+ # results = Kreuzberg.batch_extract_bytes(data, types)
318
+ # results.each { |r| puts r.content }
319
+ #
320
+ # @example Batch extract with OCR
321
+ # data = [scan_1_bytes, scan_2_bytes, scan_3_bytes]
322
+ # types = ["image/png", "image/png", "image/png"]
323
+ # config = Kreuzberg::Config::Extraction.new(force_ocr: true)
324
+ # results = Kreuzberg.batch_extract_bytes(data, types, config: config)
70
325
  def batch_extract_bytes(data_array, mime_types, config: nil)
71
326
  opts = normalize_config(config)
72
327
  hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.8'
4
+ VERSION = '4.0.0-rc.11'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -23,6 +23,12 @@ module Kreuzberg
23
23
 
24
24
  # Alias for API consistency with other language bindings
25
25
  ExtractionConfig = Config::Extraction
26
+ PageConfig = Config::PageConfig
27
+
28
+ module KeywordAlgorithm
29
+ YAKE = :yake
30
+ RAKE = :rake
31
+ end
26
32
 
27
33
  @__cache_tracker = { entries: 0, bytes: 0 }
28
34
 
Binary file