kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +11 -11
- data/README.md +5 -10
- data/examples/async_patterns.rb +0 -1
- data/ext/kreuzberg_rb/extconf.rb +0 -10
- data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
- data/ext/kreuzberg_rb/native/build.rs +2 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
- data/kreuzberg.gemspec +14 -57
- data/lib/kreuzberg/cache_api.rb +0 -1
- data/lib/kreuzberg/cli.rb +2 -2
- data/lib/kreuzberg/config.rb +2 -9
- data/lib/kreuzberg/errors.rb +7 -75
- data/lib/kreuzberg/extraction_api.rb +0 -1
- data/lib/kreuzberg/setup_lib_path.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -21
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +3 -55
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/spec_helper.rb +1 -1
- data/vendor/kreuzberg/Cargo.toml +42 -112
- data/vendor/kreuzberg/README.md +2 -2
- data/vendor/kreuzberg/build.rs +4 -18
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/extractor.rs +81 -202
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +1 -4
- data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
- data/vendor/kreuzberg/src/embeddings.rs +16 -125
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/image.rs +13 -13
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
- data/vendor/kreuzberg/src/extractors/email.rs +0 -14
- data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
- data/vendor/kreuzberg/src/extractors/html.rs +154 -137
- data/vendor/kreuzberg/src/extractors/image.rs +4 -7
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
- data/vendor/kreuzberg/src/extractors/text.rs +5 -23
- data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/lib.rs +1 -4
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
- data/vendor/kreuzberg/src/mcp/server.rs +3 -5
- data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
- data/vendor/kreuzberg/src/pdf/error.rs +1 -1
- data/vendor/kreuzberg/src/pdf/table.rs +44 -17
- data/vendor/kreuzberg/src/pdf/text.rs +3 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/types.rs +12 -42
- data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
- data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
- data/vendor/kreuzberg/tests/config_features.rs +0 -18
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
- data/vendor/kreuzberg/tests/core_integration.rs +7 -24
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -12
- metadata +25 -90
- data/.rubocop.yml +0 -538
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
- data/lib/kreuzberg/error_context.rb +0 -32
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/LICENSE-MIT +0 -21
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,24 +1,17 @@
|
|
|
1
|
-
# This crate is excluded from the workspace to use a vendored kreuzberg crate for gem packaging
|
|
2
|
-
[workspace]
|
|
3
|
-
|
|
4
|
-
[patch.crates-io]
|
|
5
|
-
# Patch rb-sys to fix Windows i32/i64 type mismatch in tracking_allocator.rs
|
|
6
|
-
rb-sys = { path = "../../../vendor/rb-sys" }
|
|
7
|
-
|
|
8
1
|
[package]
|
|
9
2
|
name = "kreuzberg-rb"
|
|
10
|
-
version = "4.0.0-rc.
|
|
3
|
+
version = "4.0.0-rc.1"
|
|
11
4
|
edition = "2024"
|
|
12
|
-
rust-version = "1.
|
|
5
|
+
rust-version = "1.85"
|
|
13
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
14
7
|
license = "MIT"
|
|
15
|
-
repository = "https://github.com/
|
|
8
|
+
repository = "https://github.com/Goldziher/kreuzberg"
|
|
16
9
|
homepage = "https://kreuzberg.dev"
|
|
17
10
|
documentation = "https://docs.rs/kreuzberg"
|
|
18
11
|
readme = "README.md"
|
|
19
12
|
description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document intelligence framework"
|
|
20
|
-
keywords = ["
|
|
21
|
-
categories = ["api-bindings"
|
|
13
|
+
keywords = ["document", "extraction", "ocr", "pdf", "ruby"]
|
|
14
|
+
categories = ["api-bindings"]
|
|
22
15
|
|
|
23
16
|
[lib]
|
|
24
17
|
name = "kreuzberg_rb"
|
|
@@ -28,17 +21,16 @@ crate-type = ["cdylib", "rlib"]
|
|
|
28
21
|
default = []
|
|
29
22
|
|
|
30
23
|
[dependencies]
|
|
31
|
-
async-trait = "0.1
|
|
32
|
-
kreuzberg = {
|
|
33
|
-
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
] }
|
|
39
|
-
serde_json = "1.0.145"
|
|
40
|
-
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
41
|
-
html-to-markdown-rs = { version = "2.11.1", default-features = false }
|
|
24
|
+
async-trait = "0.1"
|
|
25
|
+
kreuzberg = { version = "4.0.0-rc.1", features = ["full", "embeddings"] }
|
|
26
|
+
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
27
|
+
rb-sys = { version = "0.9.117", default-features = false, features = ["stable-api-compiled-fallback"] }
|
|
28
|
+
serde_json = "1.0"
|
|
29
|
+
tokio = { version = "1.48", features = ["rt", "macros"] }
|
|
30
|
+
html-to-markdown-rs = { version = "2.9.1", default-features = false }
|
|
42
31
|
|
|
43
32
|
[dev-dependencies]
|
|
44
33
|
pretty_assertions = "1.4"
|
|
34
|
+
|
|
35
|
+
[patch.crates-io]
|
|
36
|
+
kreuzberg = { path = "../../../../vendor/kreuzberg" }
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
#[cfg(target_os = "macos")]
|
|
2
2
|
fn main() {
|
|
3
3
|
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
4
|
+
// Set rpath to look for libpdfium.dylib in the same directory as the Ruby extension
|
|
4
5
|
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
5
6
|
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
6
7
|
}
|
|
7
8
|
|
|
8
9
|
#[cfg(target_os = "linux")]
|
|
9
10
|
fn main() {
|
|
11
|
+
// Set rpath to look for libpdfium.so in the same directory as the Ruby extension
|
|
10
12
|
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
11
13
|
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
12
14
|
}
|
|
@@ -49,33 +49,6 @@ impl Drop for GcGuardedValue {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
unsafe extern "C" {
|
|
53
|
-
fn kreuzberg_last_error_code() -> i32;
|
|
54
|
-
fn kreuzberg_last_panic_context() -> *const std::ffi::c_char;
|
|
55
|
-
fn kreuzberg_free_string(s: *mut std::ffi::c_char);
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
/// Retrieve panic context from FFI if available
|
|
59
|
-
fn get_panic_context() -> Option<String> {
|
|
60
|
-
unsafe {
|
|
61
|
-
let ctx_ptr = kreuzberg_last_panic_context();
|
|
62
|
-
if ctx_ptr.is_null() {
|
|
63
|
-
return None;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
|
|
67
|
-
let context = c_str.to_string_lossy().to_string();
|
|
68
|
-
kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
|
|
69
|
-
|
|
70
|
-
if context.is_empty() { None } else { Some(context) }
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
/// Retrieve error code from FFI
|
|
75
|
-
fn get_error_code() -> i32 {
|
|
76
|
-
unsafe { kreuzberg_last_error_code() }
|
|
77
|
-
}
|
|
78
|
-
|
|
79
52
|
/// Convert Kreuzberg errors to Ruby exceptions
|
|
80
53
|
fn kreuzberg_error(err: KreuzbergError) -> Error {
|
|
81
54
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
@@ -282,10 +255,10 @@ fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
|
|
|
282
255
|
return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
|
|
283
256
|
}
|
|
284
257
|
|
|
285
|
-
if let Ok(float) = f64::try_convert(value)
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
258
|
+
if let Ok(float) = f64::try_convert(value) {
|
|
259
|
+
if let Some(num) = serde_json::Number::from_f64(float) {
|
|
260
|
+
return Ok(serde_json::Value::Number(num));
|
|
261
|
+
}
|
|
289
262
|
}
|
|
290
263
|
|
|
291
264
|
if let Ok(sym) = Symbol::try_convert(value) {
|
|
@@ -660,10 +633,10 @@ fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, E
|
|
|
660
633
|
}
|
|
661
634
|
}
|
|
662
635
|
|
|
663
|
-
if let Some(val) = get_kw(ruby, hash, "language")
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
636
|
+
if let Some(val) = get_kw(ruby, hash, "language") {
|
|
637
|
+
if !val.is_nil() {
|
|
638
|
+
config.language = Some(symbol_to_string(val)?);
|
|
639
|
+
}
|
|
667
640
|
}
|
|
668
641
|
|
|
669
642
|
if let Some(val) = get_kw(ruby, hash, "yake_params")
|
|
@@ -2024,6 +1997,9 @@ fn register_post_processor(args: &[Value]) -> Result<(), Error> {
|
|
|
2024
1997
|
let processor = self.processor.value();
|
|
2025
1998
|
let result_clone = result.clone();
|
|
2026
1999
|
|
|
2000
|
+
// Use block_in_place to avoid GVL deadlocks (same pattern as Python PostProcessor)
|
|
2001
|
+
// See crates/kreuzberg-py/README.md:151-158 for explanation
|
|
2002
|
+
// CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
|
|
2027
2003
|
let updated_result = tokio::task::block_in_place(|| {
|
|
2028
2004
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2029
2005
|
let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
|
|
@@ -2230,6 +2206,9 @@ fn register_validator(args: &[Value]) -> Result<(), Error> {
|
|
|
2230
2206
|
let validator = self.validator.value();
|
|
2231
2207
|
let result_clone = result.clone();
|
|
2232
2208
|
|
|
2209
|
+
// Use block_in_place to avoid GVL deadlocks (same pattern as Python Validator)
|
|
2210
|
+
// See crates/kreuzberg-py/README.md:151-158 for explanation
|
|
2211
|
+
// CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
|
|
2233
2212
|
tokio::task::block_in_place(|| {
|
|
2234
2213
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2235
2214
|
let result_hash =
|
|
@@ -2701,6 +2680,8 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
|
|
|
2701
2680
|
set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
|
|
2702
2681
|
set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
|
|
2703
2682
|
|
|
2683
|
+
// Note: When embeddings feature is enabled in kreuzberg, the model field is EmbeddingModel
|
|
2684
|
+
// Since Ruby bindings typically build with all features, we use the model field and format it.
|
|
2704
2685
|
let model_name = format!("{:?}", preset.model);
|
|
2705
2686
|
|
|
2706
2687
|
set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
|
|
@@ -2713,43 +2694,6 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
|
|
|
2713
2694
|
}
|
|
2714
2695
|
}
|
|
2715
2696
|
|
|
2716
|
-
/// Get the last error code from FFI
|
|
2717
|
-
///
|
|
2718
|
-
/// Returns an i32 error code indicating the type of error that occurred:
|
|
2719
|
-
/// - 0: Success (no error)
|
|
2720
|
-
/// - 1: GenericError
|
|
2721
|
-
/// - 2: Panic
|
|
2722
|
-
/// - 3: InvalidArgument
|
|
2723
|
-
/// - 4: IoError
|
|
2724
|
-
/// - 5: ParsingError
|
|
2725
|
-
/// - 6: OcrError
|
|
2726
|
-
/// - 7: MissingDependency
|
|
2727
|
-
///
|
|
2728
|
-
/// @return [Integer] The error code
|
|
2729
|
-
fn last_error_code() -> i32 {
|
|
2730
|
-
get_error_code()
|
|
2731
|
-
}
|
|
2732
|
-
|
|
2733
|
-
/// Get the last panic context from FFI as a JSON string
|
|
2734
|
-
///
|
|
2735
|
-
/// Returns a JSON string containing panic context if the last error was a panic,
|
|
2736
|
-
/// or nil if no panic context is available.
|
|
2737
|
-
///
|
|
2738
|
-
/// The JSON structure contains:
|
|
2739
|
-
/// - file: Source file where panic occurred
|
|
2740
|
-
/// - line: Line number
|
|
2741
|
-
/// - function: Function name
|
|
2742
|
-
/// - message: Panic message
|
|
2743
|
-
/// - timestamp_secs: Unix timestamp
|
|
2744
|
-
///
|
|
2745
|
-
/// @return [String, nil] JSON string with panic context or nil
|
|
2746
|
-
fn last_panic_context_json(ruby: &Ruby) -> Value {
|
|
2747
|
-
match get_panic_context() {
|
|
2748
|
-
Some(json) => ruby.str_new(&json).as_value(),
|
|
2749
|
-
None => ruby.qnil().as_value(),
|
|
2750
|
-
}
|
|
2751
|
-
}
|
|
2752
|
-
|
|
2753
2697
|
/// Initialize the Kreuzberg Ruby module
|
|
2754
2698
|
#[magnus::init]
|
|
2755
2699
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
@@ -2801,9 +2745,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
2801
2745
|
module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
|
|
2802
2746
|
module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
|
|
2803
2747
|
|
|
2804
|
-
module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
|
|
2805
|
-
module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
|
|
2806
|
-
|
|
2807
2748
|
Ok(())
|
|
2808
2749
|
}
|
|
2809
2750
|
|
data/kreuzberg.gemspec
CHANGED
|
@@ -9,17 +9,17 @@ ruby_prefix = 'packages/ruby/'
|
|
|
9
9
|
ruby_cmd = %(git -C "#{repo_root}" ls-files -z #{ruby_prefix})
|
|
10
10
|
ruby_files =
|
|
11
11
|
`#{ruby_cmd}`.split("\x0")
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
.select { |path| path.start_with?(ruby_prefix) }
|
|
13
|
+
.map { |path| path.delete_prefix(ruby_prefix) }
|
|
14
14
|
|
|
15
15
|
# Include the kreuzberg core crate (needed for path patch in Cargo.toml)
|
|
16
16
|
core_prefix = 'crates/kreuzberg/'
|
|
17
17
|
core_cmd = %(git -C "#{repo_root}" ls-files -z #{core_prefix})
|
|
18
18
|
core_files =
|
|
19
19
|
`#{core_cmd}`.split("\x0")
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
.select { |path| path.start_with?(core_prefix) }
|
|
21
|
+
.map { |path| path.delete_prefix('crates/') }
|
|
22
|
+
.map { |path| "vendor/#{path}" }
|
|
23
23
|
|
|
24
24
|
fallback_files = Dir.chdir(__dir__) do
|
|
25
25
|
ruby_fallback = Dir.glob(
|
|
@@ -44,57 +44,14 @@ fallback_files = Dir.chdir(__dir__) do
|
|
|
44
44
|
# Fallback for core crate - copy from repo root
|
|
45
45
|
core_fallback = Dir.chdir(repo_root) do
|
|
46
46
|
Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
.reject { |f| File.directory?(f) }
|
|
48
|
+
.map { |path| "vendor/#{path.delete_prefix('crates/')}" }
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
ruby_fallback + core_fallback
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
-
|
|
55
|
-
vendor_files = Dir.chdir(__dir__) do
|
|
56
|
-
kreuzberg_files = if Dir.exist?('vendor/kreuzberg')
|
|
57
|
-
Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
|
|
58
|
-
.reject { |f| File.directory?(f) }
|
|
59
|
-
.reject { |f| f.include?('/.fastembed_cache/') }
|
|
60
|
-
.reject { |f| f.include?('/target/') }
|
|
61
|
-
.grep_v(/\.(swp|bak|tmp)$/)
|
|
62
|
-
.grep_v(/~$/)
|
|
63
|
-
else
|
|
64
|
-
[]
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
rb_sys_files = if Dir.exist?('vendor/rb-sys')
|
|
68
|
-
Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
|
|
69
|
-
.reject { |f| File.directory?(f) }
|
|
70
|
-
.reject { |f| f.include?('/target/') }
|
|
71
|
-
.grep_v(/\.(swp|bak|tmp)$/)
|
|
72
|
-
.grep_v(/~$/)
|
|
73
|
-
else
|
|
74
|
-
[]
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
workspace_toml = if File.exist?('vendor/Cargo.toml')
|
|
78
|
-
['vendor/Cargo.toml']
|
|
79
|
-
else
|
|
80
|
-
[]
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
kreuzberg_files + rb_sys_files + workspace_toml
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
# Use git-tracked files if available, otherwise fallback to glob
|
|
87
|
-
# Always include vendored files if they exist on disk (for CI packaging)
|
|
88
|
-
files = if (ruby_files + core_files).empty?
|
|
89
|
-
fallback_files
|
|
90
|
-
elsif vendor_files.any?
|
|
91
|
-
ruby_files + vendor_files
|
|
92
|
-
else
|
|
93
|
-
ruby_files + core_files
|
|
94
|
-
end
|
|
95
|
-
|
|
96
|
-
# Filter to only include files that actually exist
|
|
97
|
-
files = files.select { |f| File.exist?(f) }
|
|
54
|
+
files = (ruby_files + core_files).empty? ? fallback_files : (ruby_files + core_files)
|
|
98
55
|
|
|
99
56
|
Gem::Specification.new do |spec|
|
|
100
57
|
spec.name = 'kreuzberg'
|
|
@@ -108,16 +65,16 @@ Gem::Specification.new do |spec|
|
|
|
108
65
|
Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
|
|
109
66
|
including PDF, DOCX, PPTX, XLSX, images, and more.
|
|
110
67
|
DESC
|
|
111
|
-
spec.homepage = 'https://github.com/
|
|
68
|
+
spec.homepage = 'https://github.com/Goldziher/kreuzberg'
|
|
112
69
|
spec.license = 'MIT'
|
|
113
70
|
spec.required_ruby_version = '>= 3.2.0'
|
|
114
71
|
|
|
115
72
|
spec.metadata = {
|
|
116
73
|
'homepage_uri' => spec.homepage,
|
|
117
|
-
'source_code_uri' => 'https://github.com/
|
|
118
|
-
'changelog_uri' => 'https://github.com/
|
|
74
|
+
'source_code_uri' => 'https://github.com/Goldziher/kreuzberg',
|
|
75
|
+
'changelog_uri' => 'https://github.com/Goldziher/kreuzberg/blob/main/CHANGELOG.md',
|
|
119
76
|
'documentation_uri' => 'https://docs.kreuzberg.dev',
|
|
120
|
-
'bug_tracker_uri' => 'https://github.com/
|
|
77
|
+
'bug_tracker_uri' => 'https://github.com/Goldziher/kreuzberg/issues',
|
|
121
78
|
'rubygems_mfa_required' => 'true',
|
|
122
79
|
'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
|
|
123
80
|
}
|
|
@@ -132,10 +89,10 @@ Gem::Specification.new do |spec|
|
|
|
132
89
|
# None - the gem is self-contained with the Rust extension
|
|
133
90
|
|
|
134
91
|
# Development dependencies
|
|
135
|
-
spec.add_development_dependency 'bundler', '~>
|
|
92
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
|
136
93
|
spec.add_development_dependency 'rake', '~> 13.0'
|
|
137
94
|
spec.add_development_dependency 'rake-compiler', '~> 1.2'
|
|
138
|
-
spec.add_development_dependency 'rb_sys', '~> 0.9
|
|
95
|
+
spec.add_development_dependency 'rb_sys', '~> 0.9'
|
|
139
96
|
spec.add_development_dependency 'rspec', '~> 3.12'
|
|
140
97
|
unless Gem.win_platform?
|
|
141
98
|
spec.add_development_dependency 'rbs', '~> 3.0'
|
data/lib/kreuzberg/cache_api.rb
CHANGED
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -22,8 +22,8 @@ module Kreuzberg
|
|
|
22
22
|
# @return [String] Extracted content
|
|
23
23
|
#
|
|
24
24
|
def extract(path, output: 'text', ocr: false)
|
|
25
|
-
args = ['extract', path, '--
|
|
26
|
-
args
|
|
25
|
+
args = ['extract', path, '--output', output]
|
|
26
|
+
args << '--ocr' if ocr
|
|
27
27
|
CLIProxy.call(args)
|
|
28
28
|
end
|
|
29
29
|
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -39,7 +39,6 @@ module Kreuzberg
|
|
|
39
39
|
end
|
|
40
40
|
end
|
|
41
41
|
|
|
42
|
-
# Tesseract OCR engine configuration
|
|
43
42
|
class Tesseract
|
|
44
43
|
attr_reader :options
|
|
45
44
|
|
|
@@ -121,7 +120,6 @@ module Kreuzberg
|
|
|
121
120
|
end
|
|
122
121
|
end
|
|
123
122
|
|
|
124
|
-
# Embedding model configuration for document chunking
|
|
125
123
|
class Embedding
|
|
126
124
|
attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
|
|
127
125
|
|
|
@@ -359,7 +357,6 @@ module Kreuzberg
|
|
|
359
357
|
end
|
|
360
358
|
end
|
|
361
359
|
|
|
362
|
-
# HTML preprocessing configuration for content extraction
|
|
363
360
|
class HtmlPreprocessing
|
|
364
361
|
attr_reader :enabled, :preset, :remove_navigation, :remove_forms
|
|
365
362
|
|
|
@@ -388,7 +385,6 @@ module Kreuzberg
|
|
|
388
385
|
end
|
|
389
386
|
end
|
|
390
387
|
|
|
391
|
-
# HTML rendering options for document conversion
|
|
392
388
|
class HtmlOptions
|
|
393
389
|
attr_reader :options
|
|
394
390
|
|
|
@@ -416,7 +412,6 @@ module Kreuzberg
|
|
|
416
412
|
end
|
|
417
413
|
end
|
|
418
414
|
|
|
419
|
-
# YAKE keyword extraction parameters
|
|
420
415
|
class KeywordYakeParams
|
|
421
416
|
attr_reader :window_size
|
|
422
417
|
|
|
@@ -429,7 +424,6 @@ module Kreuzberg
|
|
|
429
424
|
end
|
|
430
425
|
end
|
|
431
426
|
|
|
432
|
-
# RAKE keyword extraction parameters
|
|
433
427
|
class KeywordRakeParams
|
|
434
428
|
attr_reader :min_word_length, :max_words_per_phrase
|
|
435
429
|
|
|
@@ -446,7 +440,6 @@ module Kreuzberg
|
|
|
446
440
|
end
|
|
447
441
|
end
|
|
448
442
|
|
|
449
|
-
# Keyword extraction configuration for document analysis
|
|
450
443
|
class Keywords
|
|
451
444
|
attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
|
|
452
445
|
:language, :yake_params, :rake_params
|
|
@@ -652,7 +645,7 @@ module Kreuzberg
|
|
|
652
645
|
@max_concurrent_extractions = max_concurrent_extractions&.to_i
|
|
653
646
|
end
|
|
654
647
|
|
|
655
|
-
# rubocop:disable Metrics/
|
|
648
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
|
656
649
|
def to_h
|
|
657
650
|
{
|
|
658
651
|
use_cache: @use_cache,
|
|
@@ -671,7 +664,7 @@ module Kreuzberg
|
|
|
671
664
|
max_concurrent_extractions: @max_concurrent_extractions
|
|
672
665
|
}.compact
|
|
673
666
|
end
|
|
674
|
-
# rubocop:enable Metrics/
|
|
667
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
|
675
668
|
|
|
676
669
|
private
|
|
677
670
|
|
data/lib/kreuzberg/errors.rb
CHANGED
|
@@ -1,77 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
3
|
module Kreuzberg
|
|
6
|
-
# Error code constants matching kreuzberg-ffi error codes
|
|
7
|
-
ERROR_CODE_SUCCESS = 0
|
|
8
|
-
ERROR_CODE_GENERIC = 1
|
|
9
|
-
ERROR_CODE_PANIC = 2
|
|
10
|
-
ERROR_CODE_INVALID_ARGUMENT = 3
|
|
11
|
-
ERROR_CODE_IO = 4
|
|
12
|
-
ERROR_CODE_PARSING = 5
|
|
13
|
-
ERROR_CODE_OCR = 6
|
|
14
|
-
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
15
|
-
|
|
16
4
|
module Errors
|
|
17
|
-
# Panic context information from FFI error introspection
|
|
18
|
-
class PanicContext
|
|
19
|
-
attr_reader :file, :line, :function, :message, :timestamp_secs
|
|
20
|
-
|
|
21
|
-
def initialize(file:, line:, function:, message:, timestamp_secs:)
|
|
22
|
-
@file = file
|
|
23
|
-
@line = line
|
|
24
|
-
@function = function
|
|
25
|
-
@message = message
|
|
26
|
-
@timestamp_secs = timestamp_secs
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
def to_s
|
|
30
|
-
"#{file}:#{line}:#{function}: #{message}"
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def to_h
|
|
34
|
-
{
|
|
35
|
-
file:,
|
|
36
|
-
line:,
|
|
37
|
-
function:,
|
|
38
|
-
message:,
|
|
39
|
-
timestamp_secs:
|
|
40
|
-
}
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
def self.from_json(json_string)
|
|
44
|
-
return nil if json_string.nil? || json_string.empty?
|
|
45
|
-
|
|
46
|
-
data = JSON.parse(json_string, symbolize_names: true)
|
|
47
|
-
sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
|
|
48
|
-
new(**with_defaults(sliced))
|
|
49
|
-
rescue JSON::ParserError
|
|
50
|
-
nil
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
def self.with_defaults(sliced)
|
|
54
|
-
{
|
|
55
|
-
file: sliced[:file] || '',
|
|
56
|
-
line: sliced[:line] || 0,
|
|
57
|
-
function: sliced[:function] || '',
|
|
58
|
-
message: sliced[:message] || '',
|
|
59
|
-
timestamp_secs: sliced[:timestamp_secs] || 0
|
|
60
|
-
}
|
|
61
|
-
end
|
|
62
|
-
private_class_method :with_defaults
|
|
63
|
-
end
|
|
64
|
-
|
|
65
5
|
# Base error class for all Kreuzberg errors
|
|
66
|
-
class Error < StandardError
|
|
67
|
-
attr_reader :panic_context, :error_code
|
|
68
|
-
|
|
69
|
-
def initialize(message, panic_context: nil, error_code: nil)
|
|
70
|
-
super(message)
|
|
71
|
-
@panic_context = panic_context
|
|
72
|
-
@error_code = error_code
|
|
73
|
-
end
|
|
74
|
-
end
|
|
6
|
+
class Error < StandardError; end
|
|
75
7
|
|
|
76
8
|
# Raised when validation fails
|
|
77
9
|
class ValidationError < Error; end
|
|
@@ -80,8 +12,8 @@ module Kreuzberg
|
|
|
80
12
|
class ParsingError < Error
|
|
81
13
|
attr_reader :context
|
|
82
14
|
|
|
83
|
-
def initialize(message, context: nil
|
|
84
|
-
super(message
|
|
15
|
+
def initialize(message, context: nil)
|
|
16
|
+
super(message)
|
|
85
17
|
@context = context
|
|
86
18
|
end
|
|
87
19
|
end
|
|
@@ -90,8 +22,8 @@ module Kreuzberg
|
|
|
90
22
|
class OCRError < Error
|
|
91
23
|
attr_reader :context
|
|
92
24
|
|
|
93
|
-
def initialize(message, context: nil
|
|
94
|
-
super(message
|
|
25
|
+
def initialize(message, context: nil)
|
|
26
|
+
super(message)
|
|
95
27
|
@context = context
|
|
96
28
|
end
|
|
97
29
|
end
|
|
@@ -100,8 +32,8 @@ module Kreuzberg
|
|
|
100
32
|
class MissingDependencyError < Error
|
|
101
33
|
attr_reader :dependency
|
|
102
34
|
|
|
103
|
-
def initialize(message, dependency: nil
|
|
104
|
-
super(message
|
|
35
|
+
def initialize(message, dependency: nil)
|
|
36
|
+
super(message)
|
|
105
37
|
@dependency = dependency
|
|
106
38
|
end
|
|
107
39
|
end
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -6,8 +6,6 @@ Kreuzberg::SetupLibPath.configure
|
|
|
6
6
|
require_relative 'kreuzberg/version'
|
|
7
7
|
require 'kreuzberg_rb'
|
|
8
8
|
|
|
9
|
-
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
|
|
10
|
-
# text extraction, and OCR capabilities.
|
|
11
9
|
module Kreuzberg
|
|
12
10
|
autoload :Config, 'kreuzberg/config'
|
|
13
11
|
autoload :Result, 'kreuzberg/result'
|
|
@@ -16,7 +14,6 @@ module Kreuzberg
|
|
|
16
14
|
autoload :APIProxy, 'kreuzberg/api_proxy'
|
|
17
15
|
autoload :MCPProxy, 'kreuzberg/mcp_proxy'
|
|
18
16
|
autoload :Errors, 'kreuzberg/errors'
|
|
19
|
-
autoload :ErrorContext, 'kreuzberg/error_context'
|
|
20
17
|
autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
|
|
21
18
|
autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
|
|
22
19
|
autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
|
|
@@ -76,24 +73,6 @@ module Kreuzberg
|
|
|
76
73
|
|
|
77
74
|
# List all registered OCR backends.
|
|
78
75
|
module_function :list_ocr_backends
|
|
79
|
-
|
|
80
|
-
# Detect MIME type from file bytes.
|
|
81
|
-
module_function :detect_mime_type
|
|
82
|
-
|
|
83
|
-
# Detect MIME type from a file path.
|
|
84
|
-
module_function :detect_mime_type_from_path
|
|
85
|
-
|
|
86
|
-
# Validate a MIME type string.
|
|
87
|
-
module_function :validate_mime_type
|
|
88
|
-
|
|
89
|
-
# Get file extensions for a given MIME type.
|
|
90
|
-
module_function :get_extensions_for_mime
|
|
91
|
-
|
|
92
|
-
# List all available embedding presets.
|
|
93
|
-
module_function :list_embedding_presets
|
|
94
|
-
|
|
95
|
-
# Get a specific embedding preset by name.
|
|
96
|
-
module_function :get_embedding_preset
|
|
97
76
|
end
|
|
98
77
|
|
|
99
78
|
require_relative 'kreuzberg/cache_api'
|
|
Binary file
|