kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -12
- data/README.md +22 -0
- data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
- data/kreuzberg.gemspec +34 -2
- data/lib/kreuzberg/cache_api.rb +35 -0
- data/lib/kreuzberg/error_context.rb +49 -1
- data/lib/kreuzberg/extraction_api.rb +255 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +6 -0
- data/lib/libpdfium.dylib +0 -0
- data/sig/kreuzberg.rbs +9 -0
- data/vendor/Cargo.toml +44 -0
- data/vendor/kreuzberg/Cargo.toml +65 -35
- data/vendor/kreuzberg/README.md +50 -0
- data/vendor/kreuzberg/build.rs +548 -190
- data/vendor/kreuzberg/src/api/mod.rs +0 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
- data/vendor/kreuzberg/src/embeddings.rs +71 -3
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/html.rs +37 -5
- data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
- data/vendor/kreuzberg/src/mcp/server.rs +106 -0
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
- data/vendor/kreuzberg/src/pdf/table.rs +3 -0
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
- data/vendor/kreuzberg/tests/format_integration.rs +4 -1
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/src/lib.rs +1 -0
- metadata +41 -3
- data/vendor/rb-sys/bin/release.sh +0 -22
|
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
|
|
|
7
7
|
|
|
8
8
|
[package]
|
|
9
9
|
name = "kreuzberg-rb"
|
|
10
|
-
version = "4.0.0-rc.
|
|
10
|
+
version = "4.0.0-rc.11"
|
|
11
11
|
edition = "2024"
|
|
12
12
|
rust-version = "1.91"
|
|
13
13
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -29,7 +29,7 @@ default = []
|
|
|
29
29
|
|
|
30
30
|
[dependencies]
|
|
31
31
|
async-trait = "0.1.89"
|
|
32
|
-
kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"
|
|
32
|
+
kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"] }
|
|
33
33
|
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi", features = ["embeddings"] }
|
|
34
34
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
35
35
|
"rb-sys",
|
|
@@ -48,7 +48,7 @@ tokio = { version = "1.48.0", features = [
|
|
|
48
48
|
"time",
|
|
49
49
|
"io-util",
|
|
50
50
|
] }
|
|
51
|
-
html-to-markdown-rs = { version = "2.14.
|
|
51
|
+
html-to-markdown-rs = { version = "2.14.2", default-features = false }
|
|
52
52
|
|
|
53
53
|
[dev-dependencies]
|
|
54
54
|
pretty_assertions = "1.4"
|
|
@@ -23,7 +23,9 @@ use kreuzberg::{
|
|
|
23
23
|
use magnus::exception::ExceptionClass;
|
|
24
24
|
use magnus::r_hash::ForEach;
|
|
25
25
|
use magnus::value::ReprValue;
|
|
26
|
-
use magnus::{
|
|
26
|
+
use magnus::{
|
|
27
|
+
Error, IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args,
|
|
28
|
+
};
|
|
27
29
|
use std::fs;
|
|
28
30
|
use std::path::{Path, PathBuf};
|
|
29
31
|
|
|
@@ -52,10 +54,15 @@ impl Drop for GcGuardedValue {
|
|
|
52
54
|
}
|
|
53
55
|
}
|
|
54
56
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
use std::ffi::c_char;
|
|
58
|
+
|
|
59
|
+
// These C ABI functions are provided by the kreuzberg-ffi crate
|
|
60
|
+
// We declare them here to ensure proper linking on all platforms
|
|
61
|
+
#[link(name = "kreuzberg_ffi", kind = "static")]
|
|
62
|
+
extern "C" {
|
|
63
|
+
pub fn kreuzberg_last_error_code() -> i32;
|
|
64
|
+
pub fn kreuzberg_last_panic_context() -> *mut c_char;
|
|
65
|
+
pub fn kreuzberg_free_string(s: *mut c_char);
|
|
59
66
|
}
|
|
60
67
|
|
|
61
68
|
/// Retrieve panic context from FFI if available
|
|
@@ -1797,13 +1804,16 @@ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1797
1804
|
///
|
|
1798
1805
|
fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
1799
1806
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1800
|
-
let args = scan_args::<(
|
|
1807
|
+
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
1801
1808
|
let (data, mime_type) = args.required;
|
|
1802
1809
|
let opts = Some(args.keywords);
|
|
1803
1810
|
|
|
1804
1811
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
1805
1812
|
|
|
1806
|
-
|
|
1813
|
+
// SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
|
|
1814
|
+
// borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
|
|
1815
|
+
let bytes = unsafe { data.as_slice() };
|
|
1816
|
+
let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
|
|
1807
1817
|
|
|
1808
1818
|
extraction_result_to_ruby(&ruby, result)
|
|
1809
1819
|
}
|
|
@@ -1877,7 +1887,7 @@ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1877
1887
|
///
|
|
1878
1888
|
fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
1879
1889
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1880
|
-
let args = scan_args::<(
|
|
1890
|
+
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
1881
1891
|
let (data, mime_type) = args.required;
|
|
1882
1892
|
let opts = Some(args.keywords);
|
|
1883
1893
|
|
|
@@ -1886,8 +1896,11 @@ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
|
1886
1896
|
let runtime =
|
|
1887
1897
|
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
1888
1898
|
|
|
1899
|
+
// SAFETY: we hold `data` for the duration of the call and do not re-enter Ruby while
|
|
1900
|
+
// borrowing its bytes, so Ruby cannot mutate/free this string during extraction.
|
|
1901
|
+
let bytes = unsafe { data.as_slice() };
|
|
1889
1902
|
let result = runtime
|
|
1890
|
-
.block_on(async { kreuzberg::extract_bytes(
|
|
1903
|
+
.block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
|
|
1891
1904
|
.map_err(kreuzberg_error)?;
|
|
1892
1905
|
|
|
1893
1906
|
extraction_result_to_ruby(&ruby, result)
|
|
@@ -1944,7 +1957,10 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
|
1944
1957
|
|
|
1945
1958
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
1946
1959
|
|
|
1947
|
-
let bytes_vec: Vec<
|
|
1960
|
+
let bytes_vec: Vec<RString> = bytes_array
|
|
1961
|
+
.into_iter()
|
|
1962
|
+
.map(RString::try_convert)
|
|
1963
|
+
.collect::<Result<_, _>>()?;
|
|
1948
1964
|
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
1949
1965
|
|
|
1950
1966
|
if bytes_vec.len() != mime_types.len() {
|
|
@@ -1955,10 +1971,12 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
|
1955
1971
|
)));
|
|
1956
1972
|
}
|
|
1957
1973
|
|
|
1974
|
+
// SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
|
|
1975
|
+
// borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
|
|
1958
1976
|
let contents: Vec<(&[u8], &str)> = bytes_vec
|
|
1959
1977
|
.iter()
|
|
1960
1978
|
.zip(mime_types.iter())
|
|
1961
|
-
.map(|(bytes, mime)| (bytes.
|
|
1979
|
+
.map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
|
|
1962
1980
|
.collect();
|
|
1963
1981
|
|
|
1964
1982
|
let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
|
|
@@ -1986,7 +2004,10 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
|
|
|
1986
2004
|
|
|
1987
2005
|
let config = parse_extraction_config(&ruby, opts)?;
|
|
1988
2006
|
|
|
1989
|
-
let bytes_vec: Vec<
|
|
2007
|
+
let bytes_vec: Vec<RString> = bytes_array
|
|
2008
|
+
.into_iter()
|
|
2009
|
+
.map(RString::try_convert)
|
|
2010
|
+
.collect::<Result<_, _>>()?;
|
|
1990
2011
|
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
1991
2012
|
|
|
1992
2013
|
if bytes_vec.len() != mime_types.len() {
|
|
@@ -1997,10 +2018,12 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
|
|
|
1997
2018
|
)));
|
|
1998
2019
|
}
|
|
1999
2020
|
|
|
2021
|
+
// SAFETY: we hold `bytes_vec` for the duration of the call and do not re-enter Ruby while
|
|
2022
|
+
// borrowing its bytes, so Ruby cannot mutate/free these strings during extraction.
|
|
2000
2023
|
let contents: Vec<(&[u8], &str)> = bytes_vec
|
|
2001
2024
|
.iter()
|
|
2002
2025
|
.zip(mime_types.iter())
|
|
2003
|
-
.map(|(bytes, mime)| (bytes.
|
|
2026
|
+
.map(|(bytes, mime)| (unsafe { bytes.as_slice() }, mime.as_str()))
|
|
2004
2027
|
.collect();
|
|
2005
2028
|
|
|
2006
2029
|
let runtime =
|
data/kreuzberg.gemspec
CHANGED
|
@@ -71,7 +71,16 @@ fallback_files = Dir.chdir(__dir__) do
|
|
|
71
71
|
.map { |path| "vendor/#{path.delete_prefix('crates/')}" }
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
-
|
|
74
|
+
tesseract_fallback = Dir.chdir(repo_root) do
|
|
75
|
+
Dir.glob('crates/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
|
|
76
|
+
.reject { |f| File.directory?(f) }
|
|
77
|
+
.reject { |f| f.include?('/target/') }
|
|
78
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
79
|
+
.grep_v(/~$/)
|
|
80
|
+
.map { |path| "vendor/#{path.delete_prefix('crates/')}" }
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
ruby_fallback + core_fallback + ffi_fallback + tesseract_fallback
|
|
75
84
|
end
|
|
76
85
|
|
|
77
86
|
# Check for vendored crates (copied during CI/packaging)
|
|
@@ -98,6 +107,16 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
98
107
|
[]
|
|
99
108
|
end
|
|
100
109
|
|
|
110
|
+
kreuzberg_tesseract_files = if Dir.exist?('vendor/kreuzberg-tesseract')
|
|
111
|
+
Dir.glob('vendor/kreuzberg-tesseract/**/*', File::FNM_DOTMATCH)
|
|
112
|
+
.reject { |f| File.directory?(f) }
|
|
113
|
+
.reject { |f| f.include?('/target/') }
|
|
114
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
115
|
+
.grep_v(/~$/)
|
|
116
|
+
else
|
|
117
|
+
[]
|
|
118
|
+
end
|
|
119
|
+
|
|
101
120
|
rb_sys_files = if Dir.exist?('vendor/rb-sys')
|
|
102
121
|
Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
|
|
103
122
|
.reject { |f| File.directory?(f) }
|
|
@@ -114,7 +133,7 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
114
133
|
[]
|
|
115
134
|
end
|
|
116
135
|
|
|
117
|
-
kreuzberg_files + kreuzberg_ffi_files + rb_sys_files + workspace_toml
|
|
136
|
+
kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
|
|
118
137
|
end
|
|
119
138
|
|
|
120
139
|
# Use git-tracked files if available, otherwise fallback to glob
|
|
@@ -127,8 +146,21 @@ files = if (ruby_files + core_files + ffi_files).empty?
|
|
|
127
146
|
ruby_files + core_files + ffi_files
|
|
128
147
|
end
|
|
129
148
|
|
|
149
|
+
# Include built native artifacts when present (untracked by git)
|
|
150
|
+
# This enables shipping precompiled gems from CI without committing binaries.
|
|
151
|
+
native_artifacts = Dir.chdir(__dir__) do
|
|
152
|
+
Dir.glob(%w[
|
|
153
|
+
lib/**/*.bundle
|
|
154
|
+
lib/**/*.so
|
|
155
|
+
lib/**/*.dll
|
|
156
|
+
lib/**/*.dylib
|
|
157
|
+
])
|
|
158
|
+
end
|
|
159
|
+
files.concat(native_artifacts)
|
|
160
|
+
|
|
130
161
|
# Filter to only include files that actually exist
|
|
131
162
|
files = files.select { |f| File.exist?(f) }
|
|
163
|
+
files = files.uniq
|
|
132
164
|
|
|
133
165
|
Gem::Specification.new do |spec|
|
|
134
166
|
spec.name = 'kreuzberg'
|
data/lib/kreuzberg/cache_api.rb
CHANGED
|
@@ -2,12 +2,47 @@
|
|
|
2
2
|
|
|
3
3
|
module Kreuzberg
|
|
4
4
|
# Provides caching capabilities for extraction results.
|
|
5
|
+
#
|
|
6
|
+
# This module manages the cache for document extraction results. Results are cached
|
|
7
|
+
# based on document content, configuration, and MIME type, improving performance for
|
|
8
|
+
# repeated extractions of the same documents.
|
|
5
9
|
module CacheAPI
|
|
10
|
+
# Clear all cached extraction results.
|
|
11
|
+
#
|
|
12
|
+
# Removes all entries from both the native Rust cache and the local tracking state.
|
|
13
|
+
# After calling this method, all extraction results will be recomputed on subsequent
|
|
14
|
+
# requests (unless caching is disabled).
|
|
15
|
+
#
|
|
16
|
+
# @return [void] No meaningful return value
|
|
17
|
+
#
|
|
18
|
+
# @example Clear cache
|
|
19
|
+
# Kreuzberg.clear_cache
|
|
20
|
+
# puts "Cache cleared"
|
|
6
21
|
def clear_cache
|
|
7
22
|
native_clear_cache
|
|
8
23
|
reset_cache_tracker!
|
|
9
24
|
end
|
|
10
25
|
|
|
26
|
+
# Retrieve cache statistics.
|
|
27
|
+
#
|
|
28
|
+
# Returns information about the current state of the extraction result cache,
|
|
29
|
+
# including the number of cached entries and total memory used. Statistics include
|
|
30
|
+
# both native Rust cache metrics and local tracker metrics.
|
|
31
|
+
#
|
|
32
|
+
# @return [Hash{Symbol | String => Integer}] Cache statistics hash containing:
|
|
33
|
+
# - :total_entries [Integer] Total number of cached extraction results
|
|
34
|
+
# - :total_size_bytes [Integer] Total memory used by cached results in bytes
|
|
35
|
+
#
|
|
36
|
+
# @example Get cache statistics
|
|
37
|
+
# stats = Kreuzberg.cache_stats
|
|
38
|
+
# puts "Cached entries: #{stats[:total_entries]}"
|
|
39
|
+
# puts "Cache size: #{stats[:total_size_bytes]} bytes"
|
|
40
|
+
#
|
|
41
|
+
# @example Check if cache is full
|
|
42
|
+
# stats = Kreuzberg.cache_stats
|
|
43
|
+
# if stats[:total_size_bytes] > 1_000_000_000 # 1GB
|
|
44
|
+
# Kreuzberg.clear_cache
|
|
45
|
+
# end
|
|
11
46
|
def cache_stats
|
|
12
47
|
stats = native_cache_stats
|
|
13
48
|
total_entries = (stats['total_entries'] || stats[:total_entries] || 0) + @__cache_tracker[:entries]
|
|
@@ -4,15 +4,50 @@ require 'json'
|
|
|
4
4
|
|
|
5
5
|
module Kreuzberg
|
|
6
6
|
# ErrorContext module provides access to FFI error introspection functions.
|
|
7
|
-
#
|
|
7
|
+
#
|
|
8
|
+
# This module retrieves detailed error and panic context information from the native
|
|
9
|
+
# Rust core. It allows inspection of the last error that occurred during extraction,
|
|
10
|
+
# including panic information with file, line, function, and timestamp details.
|
|
8
11
|
module ErrorContext
|
|
9
12
|
class << self
|
|
13
|
+
# Get the error code of the last operation.
|
|
14
|
+
#
|
|
15
|
+
# Returns the error code from the last FFI call. Returns 0 (SUCCESS) if no error
|
|
16
|
+
# occurred or if introspection fails.
|
|
17
|
+
#
|
|
18
|
+
# @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
|
|
19
|
+
#
|
|
20
|
+
# @example Check last error
|
|
21
|
+
# code = Kreuzberg::ErrorContext.last_error_code
|
|
22
|
+
# case code
|
|
23
|
+
# when Kreuzberg::ERROR_CODE_IO
|
|
24
|
+
# puts "I/O error occurred"
|
|
25
|
+
# when Kreuzberg::ERROR_CODE_PARSING
|
|
26
|
+
# puts "Parsing error occurred"
|
|
27
|
+
# else
|
|
28
|
+
# puts "Success or unknown error"
|
|
29
|
+
# end
|
|
10
30
|
def last_error_code
|
|
11
31
|
Kreuzberg._last_error_code_native
|
|
12
32
|
rescue StandardError
|
|
13
33
|
0
|
|
14
34
|
end
|
|
15
35
|
|
|
36
|
+
# Get panic context information from the last error.
|
|
37
|
+
#
|
|
38
|
+
# Returns a {Errors::PanicContext} object containing detailed information about
|
|
39
|
+
# the last panic that occurred in the Rust core. Includes file path, line number,
|
|
40
|
+
# function name, error message, and timestamp.
|
|
41
|
+
#
|
|
42
|
+
# @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
|
|
43
|
+
#
|
|
44
|
+
# @example Get panic details
|
|
45
|
+
# panic = Kreuzberg::ErrorContext.last_panic_context
|
|
46
|
+
# if panic
|
|
47
|
+
# puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
|
|
48
|
+
# puts "Message: #{panic.message}"
|
|
49
|
+
# puts "Time: #{panic.timestamp_secs}"
|
|
50
|
+
# end
|
|
16
51
|
def last_panic_context
|
|
17
52
|
json_str = Kreuzberg._last_panic_context_json_native
|
|
18
53
|
return nil unless json_str
|
|
@@ -22,6 +57,19 @@ module Kreuzberg
|
|
|
22
57
|
nil
|
|
23
58
|
end
|
|
24
59
|
|
|
60
|
+
# Get panic context as raw JSON string.
|
|
61
|
+
#
|
|
62
|
+
# Returns the panic context information as a JSON string for raw access or
|
|
63
|
+
# custom parsing. Returns nil if no panic has occurred.
|
|
64
|
+
#
|
|
65
|
+
# @return [String, nil] JSON-serialized panic context, or nil if no panic
|
|
66
|
+
#
|
|
67
|
+
# @example Get raw JSON panic context
|
|
68
|
+
# json = Kreuzberg::ErrorContext.last_panic_context_json
|
|
69
|
+
# if json
|
|
70
|
+
# panic_data = JSON.parse(json)
|
|
71
|
+
# puts panic_data
|
|
72
|
+
# end
|
|
25
73
|
def last_panic_context_json
|
|
26
74
|
Kreuzberg._last_panic_context_json_native
|
|
27
75
|
rescue StandardError
|
|
@@ -2,7 +2,45 @@
|
|
|
2
2
|
|
|
3
3
|
module Kreuzberg
|
|
4
4
|
# Provides extraction methods for documents and text.
|
|
5
|
+
#
|
|
6
|
+
# This module includes both synchronous and asynchronous methods for extracting
|
|
7
|
+
# content from files and byte data. Results are automatically cached based on
|
|
8
|
+
# configuration settings.
|
|
5
9
|
module ExtractionAPI
|
|
10
|
+
# Synchronously extract content from a file.
|
|
11
|
+
#
|
|
12
|
+
# Performs document extraction including text, tables, metadata, and optionally
|
|
13
|
+
# images. Supports various file formats (PDF, DOCX, XLSX, images, HTML, etc.)
|
|
14
|
+
# based on the detected or specified MIME type.
|
|
15
|
+
#
|
|
16
|
+
# @param path [String, Pathname] Path to the document file to extract
|
|
17
|
+
# @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
|
|
18
|
+
# If omitted, type is detected from file extension.
|
|
19
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration controlling
|
|
20
|
+
# OCR settings, chunking, image extraction, and more. Accepts either a {Config::Extraction}
|
|
21
|
+
# object or a configuration hash.
|
|
22
|
+
#
|
|
23
|
+
# @return [Result] Extraction result containing content, metadata, tables, and images
|
|
24
|
+
#
|
|
25
|
+
# @raise [Errors::IOError] If the file cannot be read or access is denied
|
|
26
|
+
# @raise [Errors::ParsingError] If document parsing fails
|
|
27
|
+
# @raise [Errors::UnsupportedFormatError] If the file format is not supported
|
|
28
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails
|
|
29
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
30
|
+
#
|
|
31
|
+
# @example Extract a PDF file
|
|
32
|
+
# result = Kreuzberg.extract_file_sync("document.pdf")
|
|
33
|
+
# puts result.content
|
|
34
|
+
#
|
|
35
|
+
# @example Extract with explicit MIME type
|
|
36
|
+
# result = Kreuzberg.extract_file_sync("data.bin", mime_type: "application/pdf")
|
|
37
|
+
#
|
|
38
|
+
# @example Extract with OCR enabled
|
|
39
|
+
# config = Kreuzberg::Config::Extraction.new(
|
|
40
|
+
# force_ocr: true,
|
|
41
|
+
# ocr: Kreuzberg::Config::OCR.new(language: "eng")
|
|
42
|
+
# )
|
|
43
|
+
# result = Kreuzberg.extract_file_sync("scanned.pdf", config: config)
|
|
6
44
|
def extract_file_sync(path, mime_type: nil, config: nil)
|
|
7
45
|
opts = normalize_config(config)
|
|
8
46
|
hash = if mime_type
|
|
@@ -15,6 +53,32 @@ module Kreuzberg
|
|
|
15
53
|
result
|
|
16
54
|
end
|
|
17
55
|
|
|
56
|
+
# Synchronously extract content from byte data.
|
|
57
|
+
#
|
|
58
|
+
# Performs document extraction directly from binary data in memory. Useful for
|
|
59
|
+
# extracting content from files already loaded into memory or from network streams.
|
|
60
|
+
#
|
|
61
|
+
# @param data [String] Binary document data (can contain any byte values)
|
|
62
|
+
# @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
|
|
63
|
+
# This parameter is mandatory to guide the extraction engine.
|
|
64
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
|
|
65
|
+
# either a {Config::Extraction} object or a configuration hash.
|
|
66
|
+
#
|
|
67
|
+
# @return [Result] Extraction result containing content, metadata, tables, and images
|
|
68
|
+
#
|
|
69
|
+
# @raise [Errors::ParsingError] If document parsing fails
|
|
70
|
+
# @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
|
|
71
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails
|
|
72
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
73
|
+
#
|
|
74
|
+
# @example Extract PDF from memory
|
|
75
|
+
# pdf_data = File.read("document.pdf", binmode: true)
|
|
76
|
+
# result = Kreuzberg.extract_bytes_sync(pdf_data, "application/pdf")
|
|
77
|
+
# puts result.content
|
|
78
|
+
#
|
|
79
|
+
# @example Extract from a network stream
|
|
80
|
+
# response = HTTParty.get("https://example.com/document.docx")
|
|
81
|
+
# result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
18
82
|
def extract_bytes_sync(data, mime_type, config: nil)
|
|
19
83
|
opts = normalize_config(config)
|
|
20
84
|
hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
|
|
@@ -23,6 +87,37 @@ module Kreuzberg
|
|
|
23
87
|
result
|
|
24
88
|
end
|
|
25
89
|
|
|
90
|
+
# Synchronously extract content from multiple files.
|
|
91
|
+
#
|
|
92
|
+
# Processes multiple files in a single batch operation. Files are extracted sequentially,
|
|
93
|
+
# and results maintain the same order as the input paths. This is useful for bulk
|
|
94
|
+
# processing multiple documents with consistent configuration.
|
|
95
|
+
#
|
|
96
|
+
# @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
|
|
97
|
+
# is converted to a string and MIME type is auto-detected from extension.
|
|
98
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
|
|
99
|
+
# Accepts either a {Config::Extraction} object or a configuration hash.
|
|
100
|
+
#
|
|
101
|
+
# @return [Array<Result>] Array of extraction results in the same order as input paths.
|
|
102
|
+
# Array length matches the input paths length.
|
|
103
|
+
#
|
|
104
|
+
# @raise [Errors::IOError] If any file cannot be read
|
|
105
|
+
# @raise [Errors::ParsingError] If any document parsing fails
|
|
106
|
+
# @raise [Errors::UnsupportedFormatError] If any file format is not supported
|
|
107
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails on any document
|
|
108
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
109
|
+
#
|
|
110
|
+
# @example Batch extract multiple PDFs
|
|
111
|
+
# paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
|
|
112
|
+
# results = Kreuzberg.batch_extract_files_sync(paths)
|
|
113
|
+
# results.each_with_index do |result, idx|
|
|
114
|
+
# puts "File #{idx}: #{result.content.length} characters"
|
|
115
|
+
# end
|
|
116
|
+
#
|
|
117
|
+
# @example Batch extract with consistent configuration
|
|
118
|
+
# paths = Dir.glob("documents/*.pdf")
|
|
119
|
+
# config = Kreuzberg::Config::Extraction.new(force_ocr: true)
|
|
120
|
+
# results = Kreuzberg.batch_extract_files_sync(paths, config: config)
|
|
26
121
|
def batch_extract_files_sync(paths, config: nil)
|
|
27
122
|
opts = normalize_config(config)
|
|
28
123
|
hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
|
|
@@ -31,6 +126,36 @@ module Kreuzberg
|
|
|
31
126
|
results
|
|
32
127
|
end
|
|
33
128
|
|
|
129
|
+
# Asynchronously extract content from a file.
|
|
130
|
+
#
|
|
131
|
+
# Non-blocking extraction that returns a {Result} promise. Extraction is performed
|
|
132
|
+
# in the background using native threads or the Tokio runtime. This method is
|
|
133
|
+
# preferred for I/O-bound operations and integrating with async workflows.
|
|
134
|
+
#
|
|
135
|
+
# @param path [String, Pathname] Path to the document file to extract
|
|
136
|
+
# @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
|
|
137
|
+
# If omitted, type is detected from file extension.
|
|
138
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
|
|
139
|
+
# either a {Config::Extraction} object or a configuration hash.
|
|
140
|
+
#
|
|
141
|
+
# @return [Result] Extraction result containing content, metadata, tables, and images.
|
|
142
|
+
# In async contexts, this result is available upon method return.
|
|
143
|
+
#
|
|
144
|
+
# @raise [Errors::IOError] If the file cannot be read or access is denied
|
|
145
|
+
# @raise [Errors::ParsingError] If document parsing fails
|
|
146
|
+
# @raise [Errors::UnsupportedFormatError] If the file format is not supported
|
|
147
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails
|
|
148
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
149
|
+
#
|
|
150
|
+
# @example Extract a PDF file asynchronously
|
|
151
|
+
# result = Kreuzberg.extract_file("large_document.pdf")
|
|
152
|
+
# puts result.content
|
|
153
|
+
#
|
|
154
|
+
# @example Extract with custom OCR configuration
|
|
155
|
+
# config = Kreuzberg::Config::Extraction.new(
|
|
156
|
+
# ocr: Kreuzberg::Config::OCR.new(language: "deu")
|
|
157
|
+
# )
|
|
158
|
+
# result = Kreuzberg.extract_file("document.pdf", config: config)
|
|
34
159
|
def extract_file(path, mime_type: nil, config: nil)
|
|
35
160
|
opts = normalize_config(config)
|
|
36
161
|
hash = if mime_type
|
|
@@ -43,6 +168,36 @@ module Kreuzberg
|
|
|
43
168
|
result
|
|
44
169
|
end
|
|
45
170
|
|
|
171
|
+
# Asynchronously extract content from byte data.
|
|
172
|
+
#
|
|
173
|
+
# Non-blocking extraction from in-memory binary data. Like {#extract_file},
|
|
174
|
+
# this performs extraction in the background, making it suitable for handling
|
|
175
|
+
# high-volume extraction workloads without blocking the main thread.
|
|
176
|
+
#
|
|
177
|
+
# @param data [String] Binary document data (can contain any byte values)
|
|
178
|
+
# @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
|
|
179
|
+
# This parameter is mandatory to guide the extraction engine.
|
|
180
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
|
|
181
|
+
# either a {Config::Extraction} object or a configuration hash.
|
|
182
|
+
#
|
|
183
|
+
# @return [Result] Extraction result containing content, metadata, tables, and images
|
|
184
|
+
#
|
|
185
|
+
# @raise [Errors::ParsingError] If document parsing fails
|
|
186
|
+
# @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
|
|
187
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails
|
|
188
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
189
|
+
#
|
|
190
|
+
# @example Extract PDF from memory asynchronously
|
|
191
|
+
# pdf_data = File.read("document.pdf", binmode: true)
|
|
192
|
+
# result = Kreuzberg.extract_bytes(pdf_data, "application/pdf")
|
|
193
|
+
# puts result.content
|
|
194
|
+
#
|
|
195
|
+
# @example Extract with image extraction
|
|
196
|
+
# data = File.read("file.docx", binmode: true)
|
|
197
|
+
# config = Kreuzberg::Config::Extraction.new(
|
|
198
|
+
# image_extraction: Kreuzberg::Config::ImageExtraction.new(extract_images: true)
|
|
199
|
+
# )
|
|
200
|
+
# result = Kreuzberg.extract_bytes(data, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", config: config)
|
|
46
201
|
def extract_bytes(data, mime_type, config: nil)
|
|
47
202
|
opts = normalize_config(config)
|
|
48
203
|
hash = native_extract_bytes(data.to_s, mime_type.to_s, **opts)
|
|
@@ -51,6 +206,39 @@ module Kreuzberg
|
|
|
51
206
|
result
|
|
52
207
|
end
|
|
53
208
|
|
|
209
|
+
# Asynchronously extract content from multiple files.
|
|
210
|
+
#
|
|
211
|
+
# Non-blocking batch extraction from multiple files. Results maintain the same order
|
|
212
|
+
# as input paths. This is the preferred method for bulk processing when non-blocking
|
|
213
|
+
# I/O is required (e.g., in web servers or async applications).
|
|
214
|
+
#
|
|
215
|
+
# @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
|
|
216
|
+
# is converted to a string and MIME type is auto-detected from extension.
|
|
217
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
|
|
218
|
+
# Accepts either a {Config::Extraction} object or a configuration hash.
|
|
219
|
+
#
|
|
220
|
+
# @return [Array<Result>] Array of extraction results in the same order as input paths.
|
|
221
|
+
# Array length matches the input paths length.
|
|
222
|
+
#
|
|
223
|
+
# @raise [Errors::IOError] If any file cannot be read
|
|
224
|
+
# @raise [Errors::ParsingError] If any document parsing fails
|
|
225
|
+
# @raise [Errors::UnsupportedFormatError] If any file format is not supported
|
|
226
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails on any document
|
|
227
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
228
|
+
#
|
|
229
|
+
# @example Batch extract multiple files asynchronously
|
|
230
|
+
# paths = ["invoice_1.pdf", "invoice_2.pdf", "invoice_3.pdf"]
|
|
231
|
+
# results = Kreuzberg.batch_extract_files(paths)
|
|
232
|
+
# results.each_with_index do |result, idx|
|
|
233
|
+
# puts "Invoice #{idx}: #{result.detected_languages}"
|
|
234
|
+
# end
|
|
235
|
+
#
|
|
236
|
+
# @example Batch extract with chunking
|
|
237
|
+
# paths = Dir.glob("reports/*.docx")
|
|
238
|
+
# config = Kreuzberg::Config::Extraction.new(
|
|
239
|
+
# chunking: Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
|
|
240
|
+
# )
|
|
241
|
+
# results = Kreuzberg.batch_extract_files(paths, config: config)
|
|
54
242
|
def batch_extract_files(paths, config: nil)
|
|
55
243
|
opts = normalize_config(config)
|
|
56
244
|
hashes = native_batch_extract_files(paths.map(&:to_s), **opts)
|
|
@@ -59,6 +247,37 @@ module Kreuzberg
|
|
|
59
247
|
results
|
|
60
248
|
end
|
|
61
249
|
|
|
250
|
+
# Synchronously extract content from multiple byte data sources.
|
|
251
|
+
#
|
|
252
|
+
# Processes multiple in-memory binary documents in a single batch operation. Results
|
|
253
|
+
# maintain the same order as the input data array. The mime_types array must have
|
|
254
|
+
# the same length as data_array.
|
|
255
|
+
#
|
|
256
|
+
# @param data_array [Array<String>] Array of binary document data. Each element can
|
|
257
|
+
# contain any byte values (e.g., PDF binary data).
|
|
258
|
+
# @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
|
|
259
|
+
# Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
|
|
260
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
|
|
261
|
+
# Accepts either a {Config::Extraction} object or a configuration hash.
|
|
262
|
+
#
|
|
263
|
+
# @return [Array<Result>] Array of extraction results in the same order as input data.
|
|
264
|
+
# Array length matches the data_array length.
|
|
265
|
+
#
|
|
266
|
+
# @raise [ArgumentError] If data_array and mime_types have different lengths
|
|
267
|
+
# @raise [Errors::ParsingError] If any document parsing fails
|
|
268
|
+
# @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
|
|
269
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails on any document
|
|
270
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
271
|
+
#
|
|
272
|
+
# @example Batch extract binary documents
|
|
273
|
+
# pdf_data_1 = File.read("doc1.pdf", binmode: true)
|
|
274
|
+
# pdf_data_2 = File.read("doc2.pdf", binmode: true)
|
|
275
|
+
# docx_data = File.read("report.docx", binmode: true)
|
|
276
|
+
#
|
|
277
|
+
# data = [pdf_data_1, pdf_data_2, docx_data]
|
|
278
|
+
# types = ["application/pdf", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
|
279
|
+
# results = Kreuzberg.batch_extract_bytes_sync(data, types)
|
|
280
|
+
# results.each { |r| puts r.content }
|
|
62
281
|
def batch_extract_bytes_sync(data_array, mime_types, config: nil)
|
|
63
282
|
opts = normalize_config(config)
|
|
64
283
|
hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
|
|
@@ -67,6 +286,42 @@ module Kreuzberg
|
|
|
67
286
|
results
|
|
68
287
|
end
|
|
69
288
|
|
|
289
|
+
# Asynchronously extract content from multiple byte data sources.
|
|
290
|
+
#
|
|
291
|
+
# Non-blocking batch extraction from multiple in-memory binary documents. Results
|
|
292
|
+
# maintain the same order as the input data array. This method is preferred when
|
|
293
|
+
# processing multiple documents without blocking (e.g., handling multiple uploads).
|
|
294
|
+
#
|
|
295
|
+
# @param data_array [Array<String>] Array of binary document data. Each element can
|
|
296
|
+
# contain any byte values (e.g., PDF binary data).
|
|
297
|
+
# @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
|
|
298
|
+
# Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
|
|
299
|
+
# @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
|
|
300
|
+
# Accepts either a {Config::Extraction} object or a configuration hash.
|
|
301
|
+
#
|
|
302
|
+
# @return [Array<Result>] Array of extraction results in the same order as input data.
|
|
303
|
+
# Array length matches the data_array length.
|
|
304
|
+
#
|
|
305
|
+
# @raise [ArgumentError] If data_array and mime_types have different lengths
|
|
306
|
+
# @raise [Errors::ParsingError] If any document parsing fails
|
|
307
|
+
# @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
|
|
308
|
+
# @raise [Errors::OCRError] If OCR is enabled and fails on any document
|
|
309
|
+
# @raise [Errors::MissingDependencyError] If a required dependency is missing
|
|
310
|
+
#
|
|
311
|
+
# @example Batch extract uploaded documents asynchronously
|
|
312
|
+
# # From a web request with multiple file uploads
|
|
313
|
+
# uploaded_files = params[:files] # Array of uploaded file objects
|
|
314
|
+
# data = uploaded_files.map(&:read)
|
|
315
|
+
# types = uploaded_files.map(&:content_type)
|
|
316
|
+
#
|
|
317
|
+
# results = Kreuzberg.batch_extract_bytes(data, types)
|
|
318
|
+
# results.each { |r| puts r.content }
|
|
319
|
+
#
|
|
320
|
+
# @example Batch extract with OCR
|
|
321
|
+
# data = [scan_1_bytes, scan_2_bytes, scan_3_bytes]
|
|
322
|
+
# types = ["image/png", "image/png", "image/png"]
|
|
323
|
+
# config = Kreuzberg::Config::Extraction.new(force_ocr: true)
|
|
324
|
+
# results = Kreuzberg.batch_extract_bytes(data, types, config: config)
|
|
70
325
|
def batch_extract_bytes(data_array, mime_types, config: nil)
|
|
71
326
|
opts = normalize_config(config)
|
|
72
327
|
hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -23,6 +23,12 @@ module Kreuzberg
|
|
|
23
23
|
|
|
24
24
|
# Aliases for API consistency with other language bindings
|
|
25
25
|
ExtractionConfig = Config::Extraction
|
|
26
|
+
PageConfig = Config::PageConfig
|
|
27
|
+
|
|
28
|
+
# Namespace of symbol constants for keyword algorithm names, so callers can
# write KeywordAlgorithm::YAKE / KeywordAlgorithm::RAKE instead of the bare
# symbols :yake / :rake.
# NOTE(review): presumably consumed by the keyword-extraction configuration —
# confirm which config option accepts these values.
module KeywordAlgorithm
|
|
29
|
+
YAKE = :yake
|
|
30
|
+
RAKE = :rake
|
|
31
|
+
end
|
|
26
32
|
|
|
27
33
|
@__cache_tracker = { entries: 0, bytes: 0 }
|
|
28
34
|
|
data/lib/libpdfium.dylib
ADDED
|
Binary file
|