kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,24 +1,17 @@
1
- # This crate is excluded from the workspace to use a vendored kreuzberg crate for gem packaging
2
- [workspace]
3
-
4
- [patch.crates-io]
5
- # Patch rb-sys to fix Windows i32/i64 type mismatch in tracking_allocator.rs
6
- rb-sys = { path = "../../../vendor/rb-sys" }
7
-
8
1
  [package]
9
2
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.6"
3
+ version = "4.0.0-rc.1"
11
4
  edition = "2024"
12
- rust-version = "1.91"
5
+ rust-version = "1.85"
13
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
14
7
  license = "MIT"
15
- repository = "https://github.com/kreuzberg-dev/kreuzberg"
8
+ repository = "https://github.com/Goldziher/kreuzberg"
16
9
  homepage = "https://kreuzberg.dev"
17
10
  documentation = "https://docs.rs/kreuzberg"
18
11
  readme = "README.md"
19
12
  description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document intelligence framework"
20
- keywords = ["ruby", "magnus", "document", "extraction", "bindings"]
21
- categories = ["api-bindings", "text-processing"]
13
+ keywords = ["document", "extraction", "ocr", "pdf", "ruby"]
14
+ categories = ["api-bindings"]
22
15
 
23
16
  [lib]
24
17
  name = "kreuzberg_rb"
@@ -28,17 +21,16 @@ crate-type = ["cdylib", "rlib"]
28
21
  default = []
29
22
 
30
23
  [dependencies]
31
- async-trait = "0.1.89"
32
- kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full", "embeddings"] }
33
- magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
34
- "rb-sys",
35
- ] }
36
- rb-sys = { version = "0.9.119", default-features = false, features = [
37
- "stable-api-compiled-fallback",
38
- ] }
39
- serde_json = "1.0.145"
40
- tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
41
- html-to-markdown-rs = { version = "2.11.1", default-features = false }
24
+ async-trait = "0.1"
25
+ kreuzberg = { version = "4.0.0-rc.1", features = ["full", "embeddings"] }
26
+ magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
27
+ rb-sys = { version = "0.9.117", default-features = false, features = ["stable-api-compiled-fallback"] }
28
+ serde_json = "1.0"
29
+ tokio = { version = "1.48", features = ["rt", "macros"] }
30
+ html-to-markdown-rs = { version = "2.9.1", default-features = false }
42
31
 
43
32
  [dev-dependencies]
44
33
  pretty_assertions = "1.4"
34
+
35
+ [patch.crates-io]
36
+ kreuzberg = { path = "../../../../vendor/kreuzberg" }
@@ -1,12 +1,14 @@
1
1
  #[cfg(target_os = "macos")]
2
2
  fn main() {
3
3
  println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
4
+ // Set rpath to look for libpdfium.dylib in the same directory as the Ruby extension
4
5
  println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
5
6
  println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
6
7
  }
7
8
 
8
9
  #[cfg(target_os = "linux")]
9
10
  fn main() {
11
+ // Set rpath to look for libpdfium.so in the same directory as the Ruby extension
10
12
  println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
11
13
  println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
12
14
  }
@@ -8,4 +8,4 @@
8
8
  #include_next <ieeefp.h>
9
9
  #endif
10
10
 
11
- #endif
11
+ #endif // KREUZBERG_RUBY_IEEFP_H
@@ -11,4 +11,4 @@
11
11
  #define strncasecmp _strnicmp
12
12
  #endif
13
13
 
14
- #endif
14
+ #endif /* KREUZBERG_RB_MSVC_COMPAT_STRINGS_H */
@@ -15,6 +15,6 @@
15
15
  #ifndef bzero
16
16
  #define bzero(ptr, size) memset((ptr), 0, (size))
17
17
  #endif
18
- #endif
18
+ #endif // _MSC_VER
19
19
 
20
- #endif
20
+ #endif // KREUZBERG_RUBY_STRINGS_H
@@ -44,4 +44,4 @@ typedef long ssize_t;
44
44
  #include_next <unistd.h>
45
45
  #endif
46
46
 
47
- #endif
47
+ #endif // KREUZBERG_RUBY_UNISTD_H
@@ -49,33 +49,6 @@ impl Drop for GcGuardedValue {
49
49
  }
50
50
  }
51
51
 
52
- unsafe extern "C" {
53
- fn kreuzberg_last_error_code() -> i32;
54
- fn kreuzberg_last_panic_context() -> *const std::ffi::c_char;
55
- fn kreuzberg_free_string(s: *mut std::ffi::c_char);
56
- }
57
-
58
- /// Retrieve panic context from FFI if available
59
- fn get_panic_context() -> Option<String> {
60
- unsafe {
61
- let ctx_ptr = kreuzberg_last_panic_context();
62
- if ctx_ptr.is_null() {
63
- return None;
64
- }
65
-
66
- let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
67
- let context = c_str.to_string_lossy().to_string();
68
- kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
69
-
70
- if context.is_empty() { None } else { Some(context) }
71
- }
72
- }
73
-
74
- /// Retrieve error code from FFI
75
- fn get_error_code() -> i32 {
76
- unsafe { kreuzberg_last_error_code() }
77
- }
78
-
79
52
  /// Convert Kreuzberg errors to Ruby exceptions
80
53
  fn kreuzberg_error(err: KreuzbergError) -> Error {
81
54
  let ruby = Ruby::get().expect("Ruby not initialized");
@@ -282,10 +255,10 @@ fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
282
255
  return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
283
256
  }
284
257
 
285
- if let Ok(float) = f64::try_convert(value)
286
- && let Some(num) = serde_json::Number::from_f64(float)
287
- {
288
- return Ok(serde_json::Value::Number(num));
258
+ if let Ok(float) = f64::try_convert(value) {
259
+ if let Some(num) = serde_json::Number::from_f64(float) {
260
+ return Ok(serde_json::Value::Number(num));
261
+ }
289
262
  }
290
263
 
291
264
  if let Ok(sym) = Symbol::try_convert(value) {
@@ -660,10 +633,10 @@ fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, E
660
633
  }
661
634
  }
662
635
 
663
- if let Some(val) = get_kw(ruby, hash, "language")
664
- && !val.is_nil()
665
- {
666
- config.language = Some(symbol_to_string(val)?);
636
+ if let Some(val) = get_kw(ruby, hash, "language") {
637
+ if !val.is_nil() {
638
+ config.language = Some(symbol_to_string(val)?);
639
+ }
667
640
  }
668
641
 
669
642
  if let Some(val) = get_kw(ruby, hash, "yake_params")
@@ -2024,6 +1997,9 @@ fn register_post_processor(args: &[Value]) -> Result<(), Error> {
2024
1997
  let processor = self.processor.value();
2025
1998
  let result_clone = result.clone();
2026
1999
 
2000
+ // Use block_in_place to avoid GVL deadlocks (same pattern as Python PostProcessor)
2001
+ // See crates/kreuzberg-py/README.md:151-158 for explanation
2002
+ // CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
2027
2003
  let updated_result = tokio::task::block_in_place(|| {
2028
2004
  let ruby = Ruby::get().expect("Ruby not initialized");
2029
2005
  let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
@@ -2230,6 +2206,9 @@ fn register_validator(args: &[Value]) -> Result<(), Error> {
2230
2206
  let validator = self.validator.value();
2231
2207
  let result_clone = result.clone();
2232
2208
 
2209
+ // Use block_in_place to avoid GVL deadlocks (same pattern as Python Validator)
2210
+ // See crates/kreuzberg-py/README.md:151-158 for explanation
2211
+ // CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
2233
2212
  tokio::task::block_in_place(|| {
2234
2213
  let ruby = Ruby::get().expect("Ruby not initialized");
2235
2214
  let result_hash =
@@ -2701,6 +2680,8 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
2701
2680
  set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
2702
2681
  set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
2703
2682
 
2683
+ // Note: When embeddings feature is enabled in kreuzberg, the model field is EmbeddingModel
2684
+ // Since Ruby bindings typically build with all features, we use the model field and format it.
2704
2685
  let model_name = format!("{:?}", preset.model);
2705
2686
 
2706
2687
  set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
@@ -2713,43 +2694,6 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
2713
2694
  }
2714
2695
  }
2715
2696
 
2716
- /// Get the last error code from FFI
2717
- ///
2718
- /// Returns an i32 error code indicating the type of error that occurred:
2719
- /// - 0: Success (no error)
2720
- /// - 1: GenericError
2721
- /// - 2: Panic
2722
- /// - 3: InvalidArgument
2723
- /// - 4: IoError
2724
- /// - 5: ParsingError
2725
- /// - 6: OcrError
2726
- /// - 7: MissingDependency
2727
- ///
2728
- /// @return [Integer] The error code
2729
- fn last_error_code() -> i32 {
2730
- get_error_code()
2731
- }
2732
-
2733
- /// Get the last panic context from FFI as a JSON string
2734
- ///
2735
- /// Returns a JSON string containing panic context if the last error was a panic,
2736
- /// or nil if no panic context is available.
2737
- ///
2738
- /// The JSON structure contains:
2739
- /// - file: Source file where panic occurred
2740
- /// - line: Line number
2741
- /// - function: Function name
2742
- /// - message: Panic message
2743
- /// - timestamp_secs: Unix timestamp
2744
- ///
2745
- /// @return [String, nil] JSON string with panic context or nil
2746
- fn last_panic_context_json(ruby: &Ruby) -> Value {
2747
- match get_panic_context() {
2748
- Some(json) => ruby.str_new(&json).as_value(),
2749
- None => ruby.qnil().as_value(),
2750
- }
2751
- }
2752
-
2753
2697
  /// Initialize the Kreuzberg Ruby module
2754
2698
  #[magnus::init]
2755
2699
  fn init(ruby: &Ruby) -> Result<(), Error> {
@@ -2801,9 +2745,6 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
2801
2745
  module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
2802
2746
  module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
2803
2747
 
2804
- module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
2805
- module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
2806
-
2807
2748
  Ok(())
2808
2749
  }
2809
2750
 
data/kreuzberg.gemspec CHANGED
@@ -9,17 +9,17 @@ ruby_prefix = 'packages/ruby/'
9
9
  ruby_cmd = %(git -C "#{repo_root}" ls-files -z #{ruby_prefix})
10
10
  ruby_files =
11
11
  `#{ruby_cmd}`.split("\x0")
12
- .select { |path| path.start_with?(ruby_prefix) }
13
- .map { |path| path.delete_prefix(ruby_prefix) }
12
+ .select { |path| path.start_with?(ruby_prefix) }
13
+ .map { |path| path.delete_prefix(ruby_prefix) }
14
14
 
15
15
  # Include the kreuzberg core crate (needed for path patch in Cargo.toml)
16
16
  core_prefix = 'crates/kreuzberg/'
17
17
  core_cmd = %(git -C "#{repo_root}" ls-files -z #{core_prefix})
18
18
  core_files =
19
19
  `#{core_cmd}`.split("\x0")
20
- .select { |path| path.start_with?(core_prefix) }
21
- .map { |path| path.delete_prefix('crates/') }
22
- .map { |path| "vendor/#{path}" }
20
+ .select { |path| path.start_with?(core_prefix) }
21
+ .map { |path| path.delete_prefix('crates/') }
22
+ .map { |path| "vendor/#{path}" }
23
23
 
24
24
  fallback_files = Dir.chdir(__dir__) do
25
25
  ruby_fallback = Dir.glob(
@@ -44,57 +44,14 @@ fallback_files = Dir.chdir(__dir__) do
44
44
  # Fallback for core crate - copy from repo root
45
45
  core_fallback = Dir.chdir(repo_root) do
46
46
  Dir.glob('crates/kreuzberg/**/*', File::FNM_DOTMATCH)
47
- .reject { |f| File.directory?(f) }
48
- .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
47
+ .reject { |f| File.directory?(f) }
48
+ .map { |path| "vendor/#{path.delete_prefix('crates/')}" }
49
49
  end
50
50
 
51
51
  ruby_fallback + core_fallback
52
52
  end
53
53
 
54
- # Check for vendored crates (copied during CI/packaging)
55
- vendor_files = Dir.chdir(__dir__) do
56
- kreuzberg_files = if Dir.exist?('vendor/kreuzberg')
57
- Dir.glob('vendor/kreuzberg/**/*', File::FNM_DOTMATCH)
58
- .reject { |f| File.directory?(f) }
59
- .reject { |f| f.include?('/.fastembed_cache/') }
60
- .reject { |f| f.include?('/target/') }
61
- .grep_v(/\.(swp|bak|tmp)$/)
62
- .grep_v(/~$/)
63
- else
64
- []
65
- end
66
-
67
- rb_sys_files = if Dir.exist?('vendor/rb-sys')
68
- Dir.glob('vendor/rb-sys/**/*', File::FNM_DOTMATCH)
69
- .reject { |f| File.directory?(f) }
70
- .reject { |f| f.include?('/target/') }
71
- .grep_v(/\.(swp|bak|tmp)$/)
72
- .grep_v(/~$/)
73
- else
74
- []
75
- end
76
-
77
- workspace_toml = if File.exist?('vendor/Cargo.toml')
78
- ['vendor/Cargo.toml']
79
- else
80
- []
81
- end
82
-
83
- kreuzberg_files + rb_sys_files + workspace_toml
84
- end
85
-
86
- # Use git-tracked files if available, otherwise fallback to glob
87
- # Always include vendored files if they exist on disk (for CI packaging)
88
- files = if (ruby_files + core_files).empty?
89
- fallback_files
90
- elsif vendor_files.any?
91
- ruby_files + vendor_files
92
- else
93
- ruby_files + core_files
94
- end
95
-
96
- # Filter to only include files that actually exist
97
- files = files.select { |f| File.exist?(f) }
54
+ files = (ruby_files + core_files).empty? ? fallback_files : (ruby_files + core_files)
98
55
 
99
56
  Gem::Specification.new do |spec|
100
57
  spec.name = 'kreuzberg'
@@ -108,16 +65,16 @@ Gem::Specification.new do |spec|
108
65
  Rust core. Supports extraction, OCR, chunking, and language detection for 30+ file formats
109
66
  including PDF, DOCX, PPTX, XLSX, images, and more.
110
67
  DESC
111
- spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
68
+ spec.homepage = 'https://github.com/Goldziher/kreuzberg'
112
69
  spec.license = 'MIT'
113
70
  spec.required_ruby_version = '>= 3.2.0'
114
71
 
115
72
  spec.metadata = {
116
73
  'homepage_uri' => spec.homepage,
117
- 'source_code_uri' => 'https://github.com/kreuzberg-dev/kreuzberg',
118
- 'changelog_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md',
74
+ 'source_code_uri' => 'https://github.com/Goldziher/kreuzberg',
75
+ 'changelog_uri' => 'https://github.com/Goldziher/kreuzberg/blob/main/CHANGELOG.md',
119
76
  'documentation_uri' => 'https://docs.kreuzberg.dev',
120
- 'bug_tracker_uri' => 'https://github.com/kreuzberg-dev/kreuzberg/issues',
77
+ 'bug_tracker_uri' => 'https://github.com/Goldziher/kreuzberg/issues',
121
78
  'rubygems_mfa_required' => 'true',
122
79
  'keywords' => 'document-intelligence,document-extraction,ocr,rust,bindings'
123
80
  }
@@ -132,10 +89,10 @@ Gem::Specification.new do |spec|
132
89
  # None - the gem is self-contained with the Rust extension
133
90
 
134
91
  # Development dependencies
135
- spec.add_development_dependency 'bundler', '~> 4.0'
92
+ spec.add_development_dependency 'bundler', '~> 2.0'
136
93
  spec.add_development_dependency 'rake', '~> 13.0'
137
94
  spec.add_development_dependency 'rake-compiler', '~> 1.2'
138
- spec.add_development_dependency 'rb_sys', '~> 0.9.119'
95
+ spec.add_development_dependency 'rb_sys', '~> 0.9'
139
96
  spec.add_development_dependency 'rspec', '~> 3.12'
140
97
  unless Gem.win_platform?
141
98
  spec.add_development_dependency 'rbs', '~> 3.0'
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- # Provides caching capabilities for extraction results.
5
4
  module CacheAPI
6
5
  def clear_cache
7
6
  native_clear_cache
data/lib/kreuzberg/cli.rb CHANGED
@@ -22,8 +22,8 @@ module Kreuzberg
22
22
  # @return [String] Extracted content
23
23
  #
24
24
  def extract(path, output: 'text', ocr: false)
25
- args = ['extract', path, '--format', output]
26
- args.push('--ocr', ocr ? 'true' : 'false')
25
+ args = ['extract', path, '--output', output]
26
+ args << '--ocr' if ocr
27
27
  CLIProxy.call(args)
28
28
  end
29
29
 
@@ -39,7 +39,6 @@ module Kreuzberg
39
39
  end
40
40
  end
41
41
 
42
- # Tesseract OCR engine configuration
43
42
  class Tesseract
44
43
  attr_reader :options
45
44
 
@@ -121,7 +120,6 @@ module Kreuzberg
121
120
  end
122
121
  end
123
122
 
124
- # Embedding model configuration for document chunking
125
123
  class Embedding
126
124
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
127
125
 
@@ -359,7 +357,6 @@ module Kreuzberg
359
357
  end
360
358
  end
361
359
 
362
- # HTML preprocessing configuration for content extraction
363
360
  class HtmlPreprocessing
364
361
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms
365
362
 
@@ -388,7 +385,6 @@ module Kreuzberg
388
385
  end
389
386
  end
390
387
 
391
- # HTML rendering options for document conversion
392
388
  class HtmlOptions
393
389
  attr_reader :options
394
390
 
@@ -416,7 +412,6 @@ module Kreuzberg
416
412
  end
417
413
  end
418
414
 
419
- # YAKE keyword extraction parameters
420
415
  class KeywordYakeParams
421
416
  attr_reader :window_size
422
417
 
@@ -429,7 +424,6 @@ module Kreuzberg
429
424
  end
430
425
  end
431
426
 
432
- # RAKE keyword extraction parameters
433
427
  class KeywordRakeParams
434
428
  attr_reader :min_word_length, :max_words_per_phrase
435
429
 
@@ -446,7 +440,6 @@ module Kreuzberg
446
440
  end
447
441
  end
448
442
 
449
- # Keyword extraction configuration for document analysis
450
443
  class Keywords
451
444
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
452
445
  :language, :yake_params, :rake_params
@@ -652,7 +645,7 @@ module Kreuzberg
652
645
  @max_concurrent_extractions = max_concurrent_extractions&.to_i
653
646
  end
654
647
 
655
- # rubocop:disable Metrics/CyclomaticComplexity
648
+ # rubocop:disable Metrics/PerceivedComplexity
656
649
  def to_h
657
650
  {
658
651
  use_cache: @use_cache,
@@ -671,7 +664,7 @@ module Kreuzberg
671
664
  max_concurrent_extractions: @max_concurrent_extractions
672
665
  }.compact
673
666
  end
674
- # rubocop:enable Metrics/CyclomaticComplexity
667
+ # rubocop:enable Metrics/PerceivedComplexity
675
668
 
676
669
  private
677
670
 
@@ -1,77 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'json'
4
-
5
3
  module Kreuzberg
6
- # Error code constants matching kreuzberg-ffi error codes
7
- ERROR_CODE_SUCCESS = 0
8
- ERROR_CODE_GENERIC = 1
9
- ERROR_CODE_PANIC = 2
10
- ERROR_CODE_INVALID_ARGUMENT = 3
11
- ERROR_CODE_IO = 4
12
- ERROR_CODE_PARSING = 5
13
- ERROR_CODE_OCR = 6
14
- ERROR_CODE_MISSING_DEPENDENCY = 7
15
-
16
4
  module Errors
17
- # Panic context information from FFI error introspection
18
- class PanicContext
19
- attr_reader :file, :line, :function, :message, :timestamp_secs
20
-
21
- def initialize(file:, line:, function:, message:, timestamp_secs:)
22
- @file = file
23
- @line = line
24
- @function = function
25
- @message = message
26
- @timestamp_secs = timestamp_secs
27
- end
28
-
29
- def to_s
30
- "#{file}:#{line}:#{function}: #{message}"
31
- end
32
-
33
- def to_h
34
- {
35
- file:,
36
- line:,
37
- function:,
38
- message:,
39
- timestamp_secs:
40
- }
41
- end
42
-
43
- def self.from_json(json_string)
44
- return nil if json_string.nil? || json_string.empty?
45
-
46
- data = JSON.parse(json_string, symbolize_names: true)
47
- sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
48
- new(**with_defaults(sliced))
49
- rescue JSON::ParserError
50
- nil
51
- end
52
-
53
- def self.with_defaults(sliced)
54
- {
55
- file: sliced[:file] || '',
56
- line: sliced[:line] || 0,
57
- function: sliced[:function] || '',
58
- message: sliced[:message] || '',
59
- timestamp_secs: sliced[:timestamp_secs] || 0
60
- }
61
- end
62
- private_class_method :with_defaults
63
- end
64
-
65
5
  # Base error class for all Kreuzberg errors
66
- class Error < StandardError
67
- attr_reader :panic_context, :error_code
68
-
69
- def initialize(message, panic_context: nil, error_code: nil)
70
- super(message)
71
- @panic_context = panic_context
72
- @error_code = error_code
73
- end
74
- end
6
+ class Error < StandardError; end
75
7
 
76
8
  # Raised when validation fails
77
9
  class ValidationError < Error; end
@@ -80,8 +12,8 @@ module Kreuzberg
80
12
  class ParsingError < Error
81
13
  attr_reader :context
82
14
 
83
- def initialize(message, context: nil, panic_context: nil, error_code: nil)
84
- super(message, panic_context:, error_code:)
15
+ def initialize(message, context: nil)
16
+ super(message)
85
17
  @context = context
86
18
  end
87
19
  end
@@ -90,8 +22,8 @@ module Kreuzberg
90
22
  class OCRError < Error
91
23
  attr_reader :context
92
24
 
93
- def initialize(message, context: nil, panic_context: nil, error_code: nil)
94
- super(message, panic_context:, error_code:)
25
+ def initialize(message, context: nil)
26
+ super(message)
95
27
  @context = context
96
28
  end
97
29
  end
@@ -100,8 +32,8 @@ module Kreuzberg
100
32
  class MissingDependencyError < Error
101
33
  attr_reader :dependency
102
34
 
103
- def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
104
- super(message, panic_context:, error_code:)
35
+ def initialize(message, dependency: nil)
36
+ super(message)
105
37
  @dependency = dependency
106
38
  end
107
39
  end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- # Provides extraction methods for documents and text.
5
4
  module ExtractionAPI
6
5
  def extract_file_sync(path, mime_type: nil, config: nil)
7
6
  opts = normalize_config(config)
@@ -4,7 +4,6 @@ require 'rbconfig'
4
4
  require 'open3'
5
5
 
6
6
  module Kreuzberg
7
- # Configures library paths for dynamic linking on different platforms.
8
7
  module SetupLibPath
9
8
  module_function
10
9
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.0.0-rc.6'
4
+ VERSION = '4.0.0.rc1'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -6,8 +6,6 @@ Kreuzberg::SetupLibPath.configure
6
6
  require_relative 'kreuzberg/version'
7
7
  require 'kreuzberg_rb'
8
8
 
9
- # Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
10
- # text extraction, and OCR capabilities.
11
9
  module Kreuzberg
12
10
  autoload :Config, 'kreuzberg/config'
13
11
  autoload :Result, 'kreuzberg/result'
@@ -16,7 +14,6 @@ module Kreuzberg
16
14
  autoload :APIProxy, 'kreuzberg/api_proxy'
17
15
  autoload :MCPProxy, 'kreuzberg/mcp_proxy'
18
16
  autoload :Errors, 'kreuzberg/errors'
19
- autoload :ErrorContext, 'kreuzberg/error_context'
20
17
  autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
21
18
  autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
22
19
  autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'
@@ -76,24 +73,6 @@ module Kreuzberg
76
73
 
77
74
  # List all registered OCR backends.
78
75
  module_function :list_ocr_backends
79
-
80
- # Detect MIME type from file bytes.
81
- module_function :detect_mime_type
82
-
83
- # Detect MIME type from a file path.
84
- module_function :detect_mime_type_from_path
85
-
86
- # Validate a MIME type string.
87
- module_function :validate_mime_type
88
-
89
- # Get file extensions for a given MIME type.
90
- module_function :get_extensions_for_mime
91
-
92
- # List all available embedding presets.
93
- module_function :list_embedding_presets
94
-
95
- # Get a specific embedding preset by name.
96
- module_function :get_embedding_preset
97
76
  end
98
77
 
99
78
  require_relative 'kreuzberg/cache_api'
Binary file