kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,5 +1,3 @@
1
- #![allow(unpredictable_function_pointer_comparisons)]
2
-
3
1
  //! Kreuzberg Ruby Bindings (Magnus)
4
2
  //!
5
3
  //! High-performance document intelligence framework bindings for Ruby.
@@ -9,59 +7,23 @@ use html_to_markdown_rs::options::{
9
7
  CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
10
8
  WhitespaceMode,
11
9
  };
12
- use kreuzberg::core::config::PageConfig;
13
10
  use kreuzberg::keywords::{
14
11
  KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
15
12
  YakeParams as RustYakeParams,
16
13
  };
17
14
  use kreuzberg::types::TesseractConfig as RustTesseractConfig;
18
- use kreuzberg::pdf::HierarchyConfig;
19
15
  use kreuzberg::{
20
- ChunkingConfig, EmbeddingConfig, ExtractionConfig, ExtractionResult as RustExtractionResult,
21
- ImageExtractionConfig, ImagePreprocessingConfig, KreuzbergError, LanguageDetectionConfig, OcrConfig, PdfConfig,
22
- PostProcessorConfig, TokenReductionConfig,
16
+ ChunkingConfig, EmbeddingConfig, ExtractionConfig, ExtractionResult as RustExtractionResult, ImageExtractionConfig,
17
+ ImagePreprocessingConfig, KreuzbergError, LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig,
18
+ TokenReductionConfig,
23
19
  };
24
20
  use magnus::exception::ExceptionClass;
25
21
  use magnus::r_hash::ForEach;
26
22
  use magnus::value::ReprValue;
27
- use magnus::{
28
- Error, IntoValue, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args,
29
- };
23
+ use magnus::{Error, IntoValue, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
30
24
  use std::fs;
31
25
  use std::path::{Path, PathBuf};
32
26
 
33
- // Re-export FFI types and functions from kreuzberg_ffi crate.
34
- // This ensures proper linking by importing Rust symbols directly
35
- // instead of declaring them as external C symbols.
36
- pub use kreuzberg_ffi::{
37
- // Types
38
- CErrorDetails, CMetadataField,
39
- // Panic/error handling (from panic_shield module)
40
- get_last_error_code, get_last_error_message, get_last_panic_context,
41
- // Error functions (from error module)
42
- kreuzberg_get_error_details, kreuzberg_classify_error,
43
- kreuzberg_error_code_name, kreuzberg_error_code_description,
44
- // Result functions (from result module)
45
- kreuzberg_result_get_page_count, kreuzberg_result_get_chunk_count,
46
- kreuzberg_result_get_detected_language, kreuzberg_result_get_metadata_field,
47
- // Memory and util functions (from lib.rs)
48
- kreuzberg_free_string, kreuzberg_last_error, kreuzberg_last_error_code,
49
- kreuzberg_last_panic_context,
50
- // Validation functions (from lib.rs)
51
- kreuzberg_validate_binarization_method, kreuzberg_validate_ocr_backend,
52
- kreuzberg_validate_language_code, kreuzberg_validate_token_reduction_level,
53
- kreuzberg_validate_tesseract_psm, kreuzberg_validate_tesseract_oem,
54
- kreuzberg_validate_output_format, kreuzberg_validate_confidence,
55
- kreuzberg_validate_dpi, kreuzberg_validate_chunking_params,
56
- kreuzberg_get_valid_binarization_methods, kreuzberg_get_valid_language_codes,
57
- kreuzberg_get_valid_ocr_backends, kreuzberg_get_valid_token_reduction_levels,
58
- // Config functions (from config module, now re-exported through lib.rs)
59
- kreuzberg_config_from_json, kreuzberg_config_free, kreuzberg_config_is_valid,
60
- kreuzberg_config_to_json, kreuzberg_config_get_field, kreuzberg_config_merge,
61
- };
62
-
63
- use std::ffi::c_char;
64
-
65
27
  /// Keeps Ruby values alive across plugin registrations by informing the GC.
66
28
  struct GcGuardedValue {
67
29
  value: Value,
@@ -87,27 +49,6 @@ impl Drop for GcGuardedValue {
87
49
  }
88
50
  }
89
51
 
90
- /// Retrieve panic context from FFI if available
91
- fn get_panic_context() -> Option<String> {
92
- unsafe {
93
- let ctx_ptr = kreuzberg_last_panic_context();
94
- if ctx_ptr.is_null() {
95
- return None;
96
- }
97
-
98
- let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
99
- let context = c_str.to_string_lossy().to_string();
100
- kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
101
-
102
- if context.is_empty() { None } else { Some(context) }
103
- }
104
- }
105
-
106
- /// Retrieve error code from FFI
107
- fn get_error_code() -> i32 {
108
- unsafe { kreuzberg_last_error_code() }
109
- }
110
-
111
52
  /// Convert Kreuzberg errors to Ruby exceptions
112
53
  fn kreuzberg_error(err: KreuzbergError) -> Error {
113
54
  let ruby = Ruby::get().expect("Ruby not initialized");
@@ -314,10 +255,10 @@ fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
314
255
  return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
315
256
  }
316
257
 
317
- if let Ok(float) = f64::try_convert(value)
318
- && let Some(num) = serde_json::Number::from_f64(float)
319
- {
320
- return Ok(serde_json::Value::Number(num));
258
+ if let Ok(float) = f64::try_convert(value) {
259
+ if let Some(num) = serde_json::Number::from_f64(float) {
260
+ return Ok(serde_json::Value::Number(num));
261
+ }
321
262
  }
322
263
 
323
264
  if let Ok(sym) = Symbol::try_convert(value) {
@@ -455,46 +396,6 @@ fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageD
455
396
  Ok(config)
456
397
  }
457
398
 
458
- /// Parse HierarchyConfig from Ruby Hash
459
- fn parse_hierarchy_config(ruby: &Ruby, hash: RHash) -> Result<HierarchyConfig, Error> {
460
- let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
461
- bool::try_convert(val)?
462
- } else {
463
- true
464
- };
465
-
466
- let k_clusters = if let Some(val) = get_kw(ruby, hash, "k_clusters") {
467
- usize::try_convert(val)?
468
- } else {
469
- 6
470
- };
471
-
472
- let include_bbox = if let Some(val) = get_kw(ruby, hash, "include_bbox") {
473
- bool::try_convert(val)?
474
- } else {
475
- true
476
- };
477
-
478
- let ocr_coverage_threshold = if let Some(val) = get_kw(ruby, hash, "ocr_coverage_threshold") {
479
- if !val.is_nil() {
480
- Some(f64::try_convert(val)? as f32)
481
- } else {
482
- None
483
- }
484
- } else {
485
- None
486
- };
487
-
488
- let config = HierarchyConfig {
489
- enabled,
490
- k_clusters,
491
- include_bbox,
492
- ocr_coverage_threshold,
493
- };
494
-
495
- Ok(config)
496
- }
497
-
498
399
  /// Parse PdfConfig from Ruby Hash
499
400
  fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
500
401
  let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
@@ -520,22 +421,10 @@ fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
520
421
  true
521
422
  };
522
423
 
523
- let hierarchy = if let Some(val) = get_kw(ruby, hash, "hierarchy") {
524
- if !val.is_nil() {
525
- let h_hash = RHash::try_convert(val)?;
526
- Some(parse_hierarchy_config(ruby, h_hash)?)
527
- } else {
528
- None
529
- }
530
- } else {
531
- None
532
- };
533
-
534
424
  let config = PdfConfig {
535
425
  extract_images,
536
426
  passwords,
537
427
  extract_metadata,
538
- hierarchy,
539
428
  };
540
429
 
541
430
  Ok(config)
@@ -682,8 +571,6 @@ fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorC
682
571
  enabled,
683
572
  enabled_processors,
684
573
  disabled_processors,
685
- enabled_set: None,
686
- disabled_set: None,
687
574
  };
688
575
 
689
576
  Ok(config)
@@ -746,10 +633,10 @@ fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, E
746
633
  }
747
634
  }
748
635
 
749
- if let Some(val) = get_kw(ruby, hash, "language")
750
- && !val.is_nil()
751
- {
752
- config.language = Some(symbol_to_string(val)?);
636
+ if let Some(val) = get_kw(ruby, hash, "language") {
637
+ if !val.is_nil() {
638
+ config.language = Some(symbol_to_string(val)?);
639
+ }
753
640
  }
754
641
 
755
642
  if let Some(val) = get_kw(ruby, hash, "yake_params")
@@ -1136,36 +1023,6 @@ fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result
1136
1023
 
1137
1024
  Ok(hash)
1138
1025
  }
1139
-
1140
- /// Parse PageConfig from Ruby Hash
1141
- fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
1142
- let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
1143
- bool::try_convert(val)?
1144
- } else {
1145
- false
1146
- };
1147
-
1148
- let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
1149
- bool::try_convert(val)?
1150
- } else {
1151
- false
1152
- };
1153
-
1154
- let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
1155
- String::try_convert(val)?
1156
- } else {
1157
- "\n\n<!-- PAGE {page_num} -->\n\n".to_string()
1158
- };
1159
-
1160
- let config = PageConfig {
1161
- extract_pages,
1162
- insert_page_markers,
1163
- marker_format,
1164
- };
1165
-
1166
- Ok(config)
1167
- }
1168
-
1169
1026
  /// Parse ExtractionConfig from Ruby Hash
1170
1027
  fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
1171
1028
  let mut config = ExtractionConfig::default();
@@ -1246,13 +1103,6 @@ fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<Extractio
1246
1103
  config.html_options = Some(parse_html_options(ruby, html_hash)?);
1247
1104
  }
1248
1105
 
1249
- if let Some(val) = get_kw(ruby, hash, "pages")
1250
- && !val.is_nil()
1251
- {
1252
- let pages_hash = RHash::try_convert(val)?;
1253
- config.pages = Some(parse_page_config(ruby, pages_hash)?);
1254
- }
1255
-
1256
1106
  if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
1257
1107
  let value = usize::try_convert(val)?;
1258
1108
  config.max_concurrent_extractions = Some(value);
@@ -1655,8 +1505,8 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1655
1505
  for chunk in chunks {
1656
1506
  let chunk_hash = ruby.hash_new();
1657
1507
  chunk_hash.aset("content", chunk.content)?;
1658
- chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
1659
- chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
1508
+ chunk_hash.aset("char_start", chunk.metadata.char_start)?;
1509
+ chunk_hash.aset("char_end", chunk.metadata.char_end)?;
1660
1510
  if let Some(token_count) = chunk.metadata.token_count {
1661
1511
  chunk_hash.aset("token_count", token_count)?;
1662
1512
  } else {
@@ -1664,16 +1514,6 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1664
1514
  }
1665
1515
  chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
1666
1516
  chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
1667
- if let Some(first_page) = chunk.metadata.first_page {
1668
- chunk_hash.aset("first_page", first_page as i64)?;
1669
- } else {
1670
- chunk_hash.aset("first_page", ruby.qnil().as_value())?;
1671
- }
1672
- if let Some(last_page) = chunk.metadata.last_page {
1673
- chunk_hash.aset("last_page", last_page as i64)?;
1674
- } else {
1675
- chunk_hash.aset("last_page", ruby.qnil().as_value())?;
1676
- }
1677
1517
  if let Some(embedding) = chunk.embedding {
1678
1518
  let embedding_array = ruby.ary_new();
1679
1519
  for value in embedding {
@@ -1750,92 +1590,6 @@ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Resul
1750
1590
  set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
1751
1591
  }
1752
1592
 
1753
- if let Some(page_content_list) = result.pages {
1754
- let pages_array = ruby.ary_new();
1755
- for page_content in page_content_list {
1756
- let page_hash = ruby.hash_new();
1757
- page_hash.aset("page_number", page_content.page_number as i64)?;
1758
- page_hash.aset("content", page_content.content)?;
1759
-
1760
- let tables_array = ruby.ary_new();
1761
- for table in page_content.tables {
1762
- let table_hash = ruby.hash_new();
1763
-
1764
- let cells_array = ruby.ary_new();
1765
- for row in table.cells.clone() {
1766
- let row_array = ruby.ary_from_vec(row);
1767
- cells_array.push(row_array)?;
1768
- }
1769
- table_hash.aset("cells", cells_array)?;
1770
- table_hash.aset("markdown", table.markdown.clone())?;
1771
- table_hash.aset("page_number", table.page_number as i64)?;
1772
-
1773
- tables_array.push(table_hash)?;
1774
- }
1775
- page_hash.aset("tables", tables_array)?;
1776
-
1777
- let images_array = ruby.ary_new();
1778
- for image in page_content.images {
1779
- let image_hash = ruby.hash_new();
1780
- let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
1781
- image_hash.aset("data", data_value)?;
1782
- image_hash.aset("format", image.format.clone())?;
1783
- image_hash.aset("image_index", image.image_index as i64)?;
1784
- if let Some(page) = image.page_number {
1785
- image_hash.aset("page_number", page as i64)?;
1786
- } else {
1787
- image_hash.aset("page_number", ruby.qnil().as_value())?;
1788
- }
1789
- if let Some(width) = image.width {
1790
- image_hash.aset("width", width as i64)?;
1791
- } else {
1792
- image_hash.aset("width", ruby.qnil().as_value())?;
1793
- }
1794
- if let Some(height) = image.height {
1795
- image_hash.aset("height", height as i64)?;
1796
- } else {
1797
- image_hash.aset("height", ruby.qnil().as_value())?;
1798
- }
1799
- if let Some(colorspace) = &image.colorspace {
1800
- image_hash.aset("colorspace", colorspace.clone())?;
1801
- } else {
1802
- image_hash.aset("colorspace", ruby.qnil().as_value())?;
1803
- }
1804
- if let Some(bits) = image.bits_per_component {
1805
- image_hash.aset("bits_per_component", bits as i64)?;
1806
- } else {
1807
- image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
1808
- }
1809
- image_hash.aset(
1810
- "is_mask",
1811
- if image.is_mask {
1812
- ruby.qtrue().as_value()
1813
- } else {
1814
- ruby.qfalse().as_value()
1815
- },
1816
- )?;
1817
- if let Some(description) = &image.description {
1818
- image_hash.aset("description", description.clone())?;
1819
- } else {
1820
- image_hash.aset("description", ruby.qnil().as_value())?;
1821
- }
1822
- if let Some(ocr_result) = &image.ocr_result {
1823
- let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
1824
- image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
1825
- } else {
1826
- image_hash.aset("ocr_result", ruby.qnil().as_value())?;
1827
- }
1828
- images_array.push(image_hash)?;
1829
- }
1830
- page_hash.aset("images", images_array)?;
1831
-
1832
- pages_array.push(page_hash)?;
1833
- }
1834
- set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
1835
- } else {
1836
- set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
1837
- }
1838
-
1839
1593
  Ok(hash)
1840
1594
  }
1841
1595
 
@@ -1880,14 +1634,13 @@ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
1880
1634
  ///
1881
1635
  fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
1882
1636
  let ruby = Ruby::get().expect("Ruby not initialized");
1883
- let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
1637
+ let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1884
1638
  let (data, mime_type) = args.required;
1885
1639
  let opts = Some(args.keywords);
1886
1640
 
1887
1641
  let config = parse_extraction_config(&ruby, opts)?;
1888
1642
 
1889
- let bytes = unsafe { data.as_slice() };
1890
- let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
1643
+ let result = kreuzberg::extract_bytes_sync(data.as_bytes(), &mime_type, &config).map_err(kreuzberg_error)?;
1891
1644
 
1892
1645
  extraction_result_to_ruby(&ruby, result)
1893
1646
  }
@@ -1961,7 +1714,7 @@ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
1961
1714
  ///
1962
1715
  fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
1963
1716
  let ruby = Ruby::get().expect("Ruby not initialized");
1964
- let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
1717
+ let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1965
1718
  let (data, mime_type) = args.required;
1966
1719
  let opts = Some(args.keywords);
1967
1720
 
@@ -1970,9 +1723,8 @@ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
1970
1723
  let runtime =
1971
1724
  tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1972
1725
 
1973
- let bytes = unsafe { data.as_slice() };
1974
1726
  let result = runtime
1975
- .block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
1727
+ .block_on(async { kreuzberg::extract_bytes(data.as_bytes(), &mime_type, &config).await })
1976
1728
  .map_err(kreuzberg_error)?;
1977
1729
 
1978
1730
  extraction_result_to_ruby(&ruby, result)
@@ -2029,10 +1781,7 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
2029
1781
 
2030
1782
  let config = parse_extraction_config(&ruby, opts)?;
2031
1783
 
2032
- let bytes_vec: Vec<RString> = bytes_array
2033
- .into_iter()
2034
- .map(RString::try_convert)
2035
- .collect::<Result<_, _>>()?;
1784
+ let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
2036
1785
  let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
2037
1786
 
2038
1787
  if bytes_vec.len() != mime_types.len() {
@@ -2043,10 +1792,10 @@ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
2043
1792
  )));
2044
1793
  }
2045
1794
 
2046
- let contents: Vec<(Vec<u8>, String)> = bytes_vec
1795
+ let contents: Vec<(&[u8], &str)> = bytes_vec
2047
1796
  .iter()
2048
1797
  .zip(mime_types.iter())
2049
- .map(|(bytes, mime)| (unsafe { bytes.as_slice() }.to_vec(), mime.clone()))
1798
+ .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
2050
1799
  .collect();
2051
1800
 
2052
1801
  let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
@@ -2074,10 +1823,7 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
2074
1823
 
2075
1824
  let config = parse_extraction_config(&ruby, opts)?;
2076
1825
 
2077
- let bytes_vec: Vec<RString> = bytes_array
2078
- .into_iter()
2079
- .map(RString::try_convert)
2080
- .collect::<Result<_, _>>()?;
1826
+ let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
2081
1827
  let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
2082
1828
 
2083
1829
  if bytes_vec.len() != mime_types.len() {
@@ -2088,10 +1834,10 @@ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
2088
1834
  )));
2089
1835
  }
2090
1836
 
2091
- let contents: Vec<(Vec<u8>, String)> = bytes_vec
1837
+ let contents: Vec<(&[u8], &str)> = bytes_vec
2092
1838
  .iter()
2093
1839
  .zip(mime_types.iter())
2094
- .map(|(bytes, mime)| (unsafe { bytes.as_slice() }.to_vec(), mime.clone()))
1840
+ .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
2095
1841
  .collect();
2096
1842
 
2097
1843
  let runtime =
@@ -2251,6 +1997,9 @@ fn register_post_processor(args: &[Value]) -> Result<(), Error> {
2251
1997
  let processor = self.processor.value();
2252
1998
  let result_clone = result.clone();
2253
1999
 
2000
+ // Use block_in_place to avoid GVL deadlocks (same pattern as Python PostProcessor)
2001
+ // See crates/kreuzberg-py/README.md:151-158 for explanation
2002
+ // CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
2254
2003
  let updated_result = tokio::task::block_in_place(|| {
2255
2004
  let ruby = Ruby::get().expect("Ruby not initialized");
2256
2005
  let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
@@ -2457,6 +2206,9 @@ fn register_validator(args: &[Value]) -> Result<(), Error> {
2457
2206
  let validator = self.validator.value();
2458
2207
  let result_clone = result.clone();
2459
2208
 
2209
+ // Use block_in_place to avoid GVL deadlocks (same pattern as Python Validator)
2210
+ // See crates/kreuzberg-py/README.md:151-158 for explanation
2211
+ // CRITICAL: spawn_blocking causes GVL deadlocks, must use block_in_place
2460
2212
  tokio::task::block_in_place(|| {
2461
2213
  let ruby = Ruby::get().expect("Ruby not initialized");
2462
2214
  let result_hash =
@@ -2593,7 +2345,6 @@ fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
2593
2345
  detected_languages: None,
2594
2346
  chunks: None,
2595
2347
  images: None,
2596
- pages: None,
2597
2348
  })
2598
2349
  }
2599
2350
 
@@ -2864,7 +2615,6 @@ fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Erro
2864
2615
  kreuzberg::get_extensions_for_mime(&mime_type).map_err(kreuzberg_error)
2865
2616
  }
2866
2617
 
2867
- #[cfg(feature = "embeddings")]
2868
2618
  /// List all available embedding preset names.
2869
2619
  ///
2870
2620
  /// Returns an array of preset names that can be used with get_embedding_preset.
@@ -2890,7 +2640,6 @@ fn list_embedding_presets(ruby: &Ruby) -> Result<RArray, Error> {
2890
2640
  Ok(array)
2891
2641
  }
2892
2642
 
2893
- #[cfg(feature = "embeddings")]
2894
2643
  /// Get a specific embedding preset by name.
2895
2644
  ///
2896
2645
  /// Returns a preset configuration hash, or nil if the preset name is not found.
@@ -2931,6 +2680,8 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
2931
2680
  set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
2932
2681
  set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
2933
2682
 
2683
+ // Note: When embeddings feature is enabled in kreuzberg, the model field is EmbeddingModel
2684
+ // Since Ruby bindings typically build with all features, we use the model field and format it.
2934
2685
  let model_name = format!("{:?}", preset.model);
2935
2686
 
2936
2687
  set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
@@ -2943,562 +2694,6 @@ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
2943
2694
  }
2944
2695
  }
2945
2696
 
2946
- /// Get the last error code from FFI
2947
- ///
2948
- /// Returns an i32 error code indicating the type of error that occurred:
2949
- /// - 0: Success (no error)
2950
- /// - 1: GenericError
2951
- /// - 2: Panic
2952
- /// - 3: InvalidArgument
2953
- /// - 4: IoError
2954
- /// - 5: ParsingError
2955
- /// - 6: OcrError
2956
- /// - 7: MissingDependency
2957
- ///
2958
- /// @return [Integer] The error code
2959
- fn last_error_code() -> i32 {
2960
- get_error_code()
2961
- }
2962
-
2963
- /// Get the last panic context from FFI as a JSON string
2964
- ///
2965
- /// Returns a JSON string containing panic context if the last error was a panic,
2966
- /// or nil if no panic context is available.
2967
- ///
2968
- /// The JSON structure contains:
2969
- /// - file: Source file where panic occurred
2970
- /// - line: Line number
2971
- /// - function: Function name
2972
- /// - message: Panic message
2973
- /// - timestamp_secs: Unix timestamp
2974
- ///
2975
- /// @return [String, nil] JSON string with panic context or nil
2976
- fn last_panic_context_json(ruby: &Ruby) -> Value {
2977
- match get_panic_context() {
2978
- Some(json) => ruby.str_new(&json).as_value(),
2979
- None => ruby.qnil().as_value(),
2980
- }
2981
- }
2982
-
2983
- /// Validates a binarization method string
2984
- ///
2985
- /// @param method [String] The binarization method (e.g., "otsu", "adaptive", "sauvola")
2986
- /// @return [Integer] 1 if valid, 0 if invalid (error message available via Kreuzberg::_last_error_code_native)
2987
- fn validate_binarization_method(method: String) -> Result<i32, Error> {
2988
- let c_method = std::ffi::CString::new(method).map_err(|_| runtime_error("Invalid method string"))?;
2989
-
2990
- Ok(unsafe { kreuzberg_validate_binarization_method(c_method.as_ptr()) })
2991
- }
2992
-
2993
- /// Validates an OCR backend string
2994
- ///
2995
- /// @param backend [String] The OCR backend (e.g., "tesseract", "easyocr", "paddleocr")
2996
- /// @return [Integer] 1 if valid, 0 if invalid
2997
- fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
2998
- let c_backend = std::ffi::CString::new(backend).map_err(|_| runtime_error("Invalid backend string"))?;
2999
-
3000
- Ok(unsafe { kreuzberg_validate_ocr_backend(c_backend.as_ptr()) })
3001
- }
3002
-
3003
- /// Validates a language code (ISO 639-1 or 639-3)
3004
- ///
3005
- /// @param code [String] The language code (e.g., "en", "eng", "de", "deu")
3006
- /// @return [Integer] 1 if valid, 0 if invalid
3007
- fn validate_language_code(code: String) -> Result<i32, Error> {
3008
- let c_code = std::ffi::CString::new(code).map_err(|_| runtime_error("Invalid language code string"))?;
3009
-
3010
- Ok(unsafe { kreuzberg_validate_language_code(c_code.as_ptr()) })
3011
- }
3012
-
3013
- /// Validates a token reduction level
3014
- ///
3015
- /// @param level [String] The token reduction level (e.g., "off", "light", "moderate", "aggressive", "maximum")
3016
- /// @return [Integer] 1 if valid, 0 if invalid
3017
- fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
3018
- let c_level = std::ffi::CString::new(level).map_err(|_| runtime_error("Invalid token reduction level string"))?;
3019
-
3020
- Ok(unsafe { kreuzberg_validate_token_reduction_level(c_level.as_ptr()) })
3021
- }
3022
-
3023
- /// Validates a tesseract PSM (Page Segmentation Mode) value
3024
- ///
3025
- /// @param psm [Integer] The PSM value (0-13)
3026
- /// @return [Integer] 1 if valid, 0 if invalid
3027
- fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
3028
- Ok(kreuzberg_validate_tesseract_psm(psm))
3029
- }
3030
-
3031
- /// Validates a tesseract OEM (OCR Engine Mode) value
3032
- ///
3033
- /// @param oem [Integer] The OEM value (0-3)
3034
- /// @return [Integer] 1 if valid, 0 if invalid
3035
- fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
3036
- Ok(kreuzberg_validate_tesseract_oem(oem))
3037
- }
3038
-
3039
- /// Validates an output format string
3040
- ///
3041
- /// @param format [String] The output format (e.g., "text", "markdown")
3042
- /// @return [Integer] 1 if valid, 0 if invalid
3043
- fn validate_output_format(format: String) -> Result<i32, Error> {
3044
- let c_format = std::ffi::CString::new(format).map_err(|_| runtime_error("Invalid format string"))?;
3045
-
3046
- Ok(unsafe { kreuzberg_validate_output_format(c_format.as_ptr()) })
3047
- }
3048
-
3049
- /// Validates a confidence threshold value
3050
- ///
3051
- /// @param confidence [Float] The confidence value (0.0-1.0)
3052
- /// @return [Integer] 1 if valid, 0 if invalid
3053
- fn validate_confidence(confidence: f64) -> Result<i32, Error> {
3054
- Ok(kreuzberg_validate_confidence(confidence))
3055
- }
3056
-
3057
- /// Validates a DPI (dots per inch) value
3058
- ///
3059
- /// @param dpi [Integer] The DPI value
3060
- /// @return [Integer] 1 if valid, 0 if invalid
3061
- fn validate_dpi(dpi: i32) -> Result<i32, Error> {
3062
- Ok(kreuzberg_validate_dpi(dpi))
3063
- }
3064
-
3065
- /// Validates chunking parameters
3066
- ///
3067
- /// @param max_chars [Integer] Maximum characters per chunk
3068
- /// @param max_overlap [Integer] Maximum overlap between chunks
3069
- /// @return [Integer] 1 if valid, 0 if invalid
3070
- fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<i32, Error> {
3071
- Ok(kreuzberg_validate_chunking_params(max_chars, max_overlap))
3072
- }
3073
-
3074
- /// Gets valid binarization methods as a JSON string
3075
- ///
3076
- /// @return [String] JSON array of valid binarization methods
3077
- fn get_valid_binarization_methods(_ruby: &Ruby) -> Result<String, Error> {
3078
- let ptr = kreuzberg_get_valid_binarization_methods();
3079
- if ptr.is_null() {
3080
- return Err(runtime_error("Failed to get valid binarization methods"));
3081
- }
3082
-
3083
- let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
3084
- let result = c_str
3085
- .to_str()
3086
- .map_err(|_| runtime_error("Invalid UTF-8 in binarization methods"))?
3087
- .to_string();
3088
-
3089
- unsafe {
3090
- kreuzberg_free_string(ptr as *mut c_char);
3091
- }
3092
-
3093
- Ok(result)
3094
- }
3095
-
3096
- /// Gets valid language codes as a JSON string
3097
- ///
3098
- /// @return [String] JSON array of valid language codes
3099
- fn get_valid_language_codes(_ruby: &Ruby) -> Result<String, Error> {
3100
- let ptr = kreuzberg_get_valid_language_codes();
3101
- if ptr.is_null() {
3102
- return Err(runtime_error("Failed to get valid language codes"));
3103
- }
3104
-
3105
- let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
3106
- let result = c_str
3107
- .to_str()
3108
- .map_err(|_| runtime_error("Invalid UTF-8 in language codes"))?
3109
- .to_string();
3110
-
3111
- unsafe {
3112
- kreuzberg_free_string(ptr as *mut c_char);
3113
- }
3114
-
3115
- Ok(result)
3116
- }
3117
-
3118
- /// Gets valid OCR backends as a JSON string
3119
- ///
3120
- /// @return [String] JSON array of valid OCR backends
3121
- fn get_valid_ocr_backends(_ruby: &Ruby) -> Result<String, Error> {
3122
- let ptr = kreuzberg_get_valid_ocr_backends();
3123
- if ptr.is_null() {
3124
- return Err(runtime_error("Failed to get valid OCR backends"));
3125
- }
3126
-
3127
- let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
3128
- let result = c_str
3129
- .to_str()
3130
- .map_err(|_| runtime_error("Invalid UTF-8 in OCR backends"))?
3131
- .to_string();
3132
-
3133
- unsafe {
3134
- kreuzberg_free_string(ptr as *mut c_char);
3135
- }
3136
-
3137
- Ok(result)
3138
- }
3139
-
3140
- /// Gets valid token reduction levels as a JSON string
3141
- ///
3142
- /// @return [String] JSON array of valid token reduction levels
3143
- fn get_valid_token_reduction_levels(_ruby: &Ruby) -> Result<String, Error> {
3144
- let ptr = kreuzberg_get_valid_token_reduction_levels();
3145
- if ptr.is_null() {
3146
- return Err(runtime_error("Failed to get valid token reduction levels"));
3147
- }
3148
-
3149
- let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
3150
- let result = c_str
3151
- .to_str()
3152
- .map_err(|_| runtime_error("Invalid UTF-8 in token reduction levels"))?
3153
- .to_string();
3154
-
3155
- unsafe {
3156
- kreuzberg_free_string(ptr as *mut c_char);
3157
- }
3158
-
3159
- Ok(result)
3160
- }
3161
-
3162
- /// Serialize a config to JSON string
3163
- /// @param config_json [String] JSON string representing the config
3164
- /// @return [String] Serialized JSON config
3165
- fn config_to_json_wrapper(_ruby: &Ruby, config_json: String) -> Result<String, Error> {
3166
- let c_json =
3167
- std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
3168
-
3169
- let config_ptr = unsafe { kreuzberg_config_from_json(c_json.as_ptr()) };
3170
- if config_ptr.is_null() {
3171
- return Err(runtime_error("Failed to parse config from JSON"));
3172
- }
3173
-
3174
- let json_ptr = unsafe { kreuzberg_config_to_json(config_ptr) };
3175
- let result = if json_ptr.is_null() {
3176
- Err(runtime_error("Failed to serialize config to JSON"))
3177
- } else {
3178
- let c_str = unsafe { std::ffi::CStr::from_ptr(json_ptr) };
3179
- let json = c_str
3180
- .to_str()
3181
- .map_err(|_| runtime_error("Invalid UTF-8 in serialized config"))?
3182
- .to_string();
3183
- unsafe {
3184
- kreuzberg_free_string(json_ptr as *mut c_char);
3185
- }
3186
- Ok(json)
3187
- };
3188
-
3189
- unsafe {
3190
- kreuzberg_config_free(config_ptr);
3191
- }
3192
- result
3193
- }
3194
-
3195
- /// Get a field from config
3196
- /// @param config_json [String] JSON string representing the config
3197
- /// @param field_name [String] Field name (supports dot notation)
3198
- /// @return [Object] Parsed JSON value, or nil if field doesn't exist
3199
- fn config_get_field_wrapper(ruby: &Ruby, config_json: String, field_name: String) -> Result<Value, Error> {
3200
- let c_json =
3201
- std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
3202
- let c_field =
3203
- std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
3204
-
3205
- let config_ptr = unsafe { kreuzberg_config_from_json(c_json.as_ptr()) };
3206
- if config_ptr.is_null() {
3207
- return Err(runtime_error("Failed to parse config from JSON"));
3208
- }
3209
-
3210
- let field_ptr = unsafe { kreuzberg_config_get_field(config_ptr, c_field.as_ptr()) };
3211
- let result = if field_ptr.is_null() {
3212
- Ok(ruby.qnil().as_value())
3213
- } else {
3214
- let c_str = unsafe { std::ffi::CStr::from_ptr(field_ptr) };
3215
- let json_str = c_str
3216
- .to_str()
3217
- .map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
3218
- let json_value: serde_json::Value =
3219
- serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
3220
- unsafe {
3221
- kreuzberg_free_string(field_ptr as *mut c_char);
3222
- }
3223
- json_value_to_ruby(ruby, &json_value)
3224
- };
3225
-
3226
- unsafe {
3227
- kreuzberg_config_free(config_ptr);
3228
- }
3229
- result
3230
- }
3231
-
3232
- /// Merge two configs
3233
- /// @param base_json [String] Base config JSON
3234
- /// @param override_json [String] Override config JSON
3235
- /// @return [String] Merged config JSON
3236
- fn config_merge_wrapper(_ruby: &Ruby, base_json: String, override_json: String) -> Result<String, Error> {
3237
- let c_base =
3238
- std::ffi::CString::new(base_json).map_err(|e| runtime_error(format!("Invalid base config JSON: {}", e)))?;
3239
- let c_override = std::ffi::CString::new(override_json)
3240
- .map_err(|e| runtime_error(format!("Invalid override config JSON: {}", e)))?;
3241
-
3242
- let base_ptr = unsafe { kreuzberg_config_from_json(c_base.as_ptr()) };
3243
- if base_ptr.is_null() {
3244
- return Err(runtime_error("Failed to parse base config from JSON"));
3245
- }
3246
-
3247
- let override_ptr = unsafe { kreuzberg_config_from_json(c_override.as_ptr()) };
3248
- if override_ptr.is_null() {
3249
- unsafe {
3250
- kreuzberg_config_free(base_ptr);
3251
- }
3252
- return Err(runtime_error("Failed to parse override config from JSON"));
3253
- }
3254
-
3255
- let merge_result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
3256
-
3257
- let result = if merge_result == 0 {
3258
- Err(runtime_error("Failed to merge configs"))
3259
- } else {
3260
- let json_ptr = unsafe { kreuzberg_config_to_json(base_ptr) };
3261
- if json_ptr.is_null() {
3262
- Err(runtime_error("Failed to serialize merged config"))
3263
- } else {
3264
- let c_str = unsafe { std::ffi::CStr::from_ptr(json_ptr) };
3265
- let json = c_str
3266
- .to_str()
3267
- .map_err(|_| runtime_error("Invalid UTF-8 in merged config"))?
3268
- .to_string();
3269
- unsafe {
3270
- kreuzberg_free_string(json_ptr as *mut c_char);
3271
- }
3272
- Ok(json)
3273
- }
3274
- };
3275
-
3276
- unsafe {
3277
- kreuzberg_config_free(base_ptr);
3278
- kreuzberg_config_free(override_ptr);
3279
- }
3280
- result
3281
- }
3282
-
3283
- /// Get page count from result
3284
- /// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
3285
- /// @return [Integer] Page count, or -1 on error
3286
- fn result_page_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
3287
- if result_ptr == 0 {
3288
- return Err(runtime_error("Invalid result pointer"));
3289
- }
3290
-
3291
- let page_count = unsafe { kreuzberg_result_get_page_count(result_ptr as *const RustExtractionResult) };
3292
-
3293
- Ok(page_count)
3294
- }
3295
-
3296
- /// Get chunk count from result
3297
- /// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
3298
- /// @return [Integer] Chunk count, or -1 on error
3299
- fn result_chunk_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
3300
- if result_ptr == 0 {
3301
- return Err(runtime_error("Invalid result pointer"));
3302
- }
3303
-
3304
- let chunk_count = unsafe { kreuzberg_result_get_chunk_count(result_ptr as *const RustExtractionResult) };
3305
-
3306
- Ok(chunk_count)
3307
- }
3308
-
3309
- /// Get detected language from result
3310
- /// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
3311
- /// @return [String, nil] Detected language code, or nil if not detected
3312
- fn result_detected_language(_ruby: &Ruby, result_ptr: i64) -> Result<Value, Error> {
3313
- if result_ptr == 0 {
3314
- return Err(runtime_error("Invalid result pointer"));
3315
- }
3316
-
3317
- let lang_ptr = unsafe { kreuzberg_result_get_detected_language(result_ptr as *const RustExtractionResult) };
3318
-
3319
- if lang_ptr.is_null() {
3320
- return Ok(_ruby.qnil().as_value());
3321
- }
3322
-
3323
- let c_str = unsafe { std::ffi::CStr::from_ptr(lang_ptr) };
3324
- let lang = c_str
3325
- .to_str()
3326
- .map_err(|_| runtime_error("Invalid UTF-8 in detected language"))?
3327
- .to_string();
3328
-
3329
- unsafe {
3330
- kreuzberg_free_string(lang_ptr as *mut c_char);
3331
- }
3332
-
3333
- Ok(_ruby.str_new(&lang).into_value_with(_ruby))
3334
- }
3335
-
3336
- /// Get metadata field from result
3337
- /// @param result_ptr [Integer] Opaque pointer to ExtractionResult (as integer)
3338
- /// @param field_name [String] Field name (supports dot notation)
3339
- /// @return [Object, nil] Parsed JSON value, or nil if field doesn't exist
3340
- fn result_metadata_field(ruby: &Ruby, result_ptr: i64, field_name: String) -> Result<Value, Error> {
3341
- if result_ptr == 0 {
3342
- return Err(runtime_error("Invalid result pointer"));
3343
- }
3344
-
3345
- let c_field =
3346
- std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
3347
-
3348
- let field = unsafe { kreuzberg_result_get_metadata_field(result_ptr as *const RustExtractionResult, c_field.as_ptr()) };
3349
-
3350
- if field.is_null != 0 {
3351
- return Ok(ruby.qnil().as_value());
3352
- }
3353
-
3354
- if field.json_value.is_null() {
3355
- return Ok(ruby.qnil().as_value());
3356
- }
3357
-
3358
- let c_str = unsafe { std::ffi::CStr::from_ptr(field.json_value) };
3359
- let json_str = c_str
3360
- .to_str()
3361
- .map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
3362
- let json_value: serde_json::Value =
3363
- serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
3364
-
3365
- unsafe {
3366
- kreuzberg_free_string(field.json_value);
3367
- }
3368
-
3369
- json_value_to_ruby(ruby, &json_value)
3370
- }
3371
-
3372
- /// Get structured error details from FFI
3373
- /// @return [Hash] Error details with keys: :message, :error_code, :error_type, :source_file, :source_function, :source_line, :context_info, :is_panic
3374
- fn get_error_details_native(ruby: &Ruby) -> Result<Value, Error> {
3375
- let details = kreuzberg_get_error_details();
3376
-
3377
- let hash = ruby.hash_new();
3378
-
3379
- unsafe {
3380
- let message = if !details.message.is_null() {
3381
- let c_str = std::ffi::CStr::from_ptr(details.message);
3382
- let msg = c_str.to_str().unwrap_or("").to_string();
3383
- kreuzberg_free_string(details.message);
3384
- msg
3385
- } else {
3386
- String::new()
3387
- };
3388
-
3389
- let error_type = if !details.error_type.is_null() {
3390
- let c_str = std::ffi::CStr::from_ptr(details.error_type);
3391
- let ty = c_str.to_str().unwrap_or("unknown").to_string();
3392
- kreuzberg_free_string(details.error_type);
3393
- ty
3394
- } else {
3395
- "unknown".to_string()
3396
- };
3397
-
3398
- let source_file = if !details.source_file.is_null() {
3399
- let c_str = std::ffi::CStr::from_ptr(details.source_file);
3400
- let file = c_str.to_str().ok().map(|s| s.to_string());
3401
- kreuzberg_free_string(details.source_file);
3402
- file
3403
- } else {
3404
- None
3405
- };
3406
-
3407
- let source_function = if !details.source_function.is_null() {
3408
- let c_str = std::ffi::CStr::from_ptr(details.source_function);
3409
- let func = c_str.to_str().ok().map(|s| s.to_string());
3410
- kreuzberg_free_string(details.source_function);
3411
- func
3412
- } else {
3413
- None
3414
- };
3415
-
3416
- let context_info = if !details.context_info.is_null() {
3417
- let c_str = std::ffi::CStr::from_ptr(details.context_info);
3418
- let ctx = c_str.to_str().ok().map(|s| s.to_string());
3419
- kreuzberg_free_string(details.context_info);
3420
- ctx
3421
- } else {
3422
- None
3423
- };
3424
-
3425
- hash.aset(ruby.to_symbol("message"), ruby.str_new(&message).as_value())?;
3426
- hash.aset(ruby.to_symbol("error_code"), details.error_code.into_value_with(ruby))?;
3427
- hash.aset(ruby.to_symbol("error_type"), ruby.str_new(&error_type).as_value())?;
3428
-
3429
- if let Some(file) = source_file {
3430
- hash.aset(ruby.to_symbol("source_file"), ruby.str_new(&file).as_value())?;
3431
- } else {
3432
- hash.aset(ruby.to_symbol("source_file"), ruby.qnil().as_value())?;
3433
- }
3434
-
3435
- if let Some(func) = source_function {
3436
- hash.aset(ruby.to_symbol("source_function"), ruby.str_new(&func).as_value())?;
3437
- } else {
3438
- hash.aset(ruby.to_symbol("source_function"), ruby.qnil().as_value())?;
3439
- }
3440
-
3441
- hash.aset(ruby.to_symbol("source_line"), details.source_line.into_value_with(ruby))?;
3442
-
3443
- if let Some(ctx) = context_info {
3444
- hash.aset(ruby.to_symbol("context_info"), ruby.str_new(&ctx).as_value())?;
3445
- } else {
3446
- hash.aset(ruby.to_symbol("context_info"), ruby.qnil().as_value())?;
3447
- }
3448
-
3449
- hash.aset(
3450
- ruby.to_symbol("is_panic"),
3451
- (details.is_panic != 0).into_value_with(ruby),
3452
- )?;
3453
- }
3454
-
3455
- Ok(hash.into_value_with(ruby))
3456
- }
3457
-
3458
- /// Classify an error based on an error message string
3459
- /// @param message [String] The error message to classify
3460
- /// @return [Integer] Error code (0-7)
3461
- fn classify_error_native(ruby: &Ruby, message: String) -> Result<Value, Error> {
3462
- let c_message =
3463
- std::ffi::CString::new(message).map_err(|e| runtime_error(format!("Invalid error message: {}", e)))?;
3464
-
3465
- let code = unsafe { kreuzberg_classify_error(c_message.as_ptr()) };
3466
-
3467
- Ok(code.into_value_with(ruby))
3468
- }
3469
-
3470
- /// Get the human-readable name of an error code
3471
- /// @param code [Integer] Numeric error code (0-7)
3472
- /// @return [String] Human-readable error code name
3473
- fn error_code_name_native(ruby: &Ruby, code: u32) -> Result<Value, Error> {
3474
- let name_ptr = kreuzberg_error_code_name(code);
3475
-
3476
- if name_ptr.is_null() {
3477
- return Ok(ruby.str_new("unknown").as_value());
3478
- }
3479
-
3480
- let c_str = unsafe { std::ffi::CStr::from_ptr(name_ptr) };
3481
- let name = c_str.to_str().unwrap_or("unknown").to_string();
3482
-
3483
- Ok(ruby.str_new(&name).as_value())
3484
- }
3485
-
3486
- /// Get the description of an error code
3487
- /// @param code [Integer] Numeric error code (0-7)
3488
- /// @return [String] Description of the error code
3489
- fn error_code_description_native(ruby: &Ruby, code: u32) -> Result<Value, Error> {
3490
- let desc_ptr = kreuzberg_error_code_description(code);
3491
-
3492
- if desc_ptr.is_null() {
3493
- return Ok(ruby.str_new("Unknown error code").as_value());
3494
- }
3495
-
3496
- let c_str = unsafe { std::ffi::CStr::from_ptr(desc_ptr) };
3497
- let desc = c_str.to_str().unwrap_or("Unknown error code").to_string();
3498
-
3499
- Ok(ruby.str_new(&desc).as_value())
3500
- }
3501
-
3502
2697
  /// Initialize the Kreuzberg Ruby module
3503
2698
  #[magnus::init]
3504
2699
  fn init(ruby: &Ruby) -> Result<(), Error> {
@@ -3547,66 +2742,8 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
3547
2742
  module.define_module_function("get_extensions_for_mime", function!(get_extensions_for_mime_native, 1))?;
3548
2743
  module.define_module_function("validate_mime_type", function!(validate_mime_type_native, 1))?;
3549
2744
 
3550
- #[cfg(feature = "embeddings")]
3551
- {
3552
- module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
3553
- module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
3554
- }
3555
-
3556
- module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
3557
- module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
3558
-
3559
- module.define_module_function(
3560
- "_validate_binarization_method_native",
3561
- function!(validate_binarization_method, 1),
3562
- )?;
3563
- module.define_module_function("_validate_ocr_backend_native", function!(validate_ocr_backend, 1))?;
3564
- module.define_module_function("_validate_language_code_native", function!(validate_language_code, 1))?;
3565
- module.define_module_function(
3566
- "_validate_token_reduction_level_native",
3567
- function!(validate_token_reduction_level, 1),
3568
- )?;
3569
- module.define_module_function("_validate_tesseract_psm_native", function!(validate_tesseract_psm, 1))?;
3570
- module.define_module_function("_validate_tesseract_oem_native", function!(validate_tesseract_oem, 1))?;
3571
- module.define_module_function("_validate_output_format_native", function!(validate_output_format, 1))?;
3572
- module.define_module_function("_validate_confidence_native", function!(validate_confidence, 1))?;
3573
- module.define_module_function("_validate_dpi_native", function!(validate_dpi, 1))?;
3574
- module.define_module_function(
3575
- "_validate_chunking_params_native",
3576
- function!(validate_chunking_params, 2),
3577
- )?;
3578
- module.define_module_function(
3579
- "_get_valid_binarization_methods_native",
3580
- function!(get_valid_binarization_methods, 0),
3581
- )?;
3582
- module.define_module_function(
3583
- "_get_valid_language_codes_native",
3584
- function!(get_valid_language_codes, 0),
3585
- )?;
3586
- module.define_module_function("_get_valid_ocr_backends_native", function!(get_valid_ocr_backends, 0))?;
3587
- module.define_module_function(
3588
- "_get_valid_token_reduction_levels_native",
3589
- function!(get_valid_token_reduction_levels, 0),
3590
- )?;
3591
-
3592
- module.define_module_function("_config_to_json_native", function!(config_to_json_wrapper, 1))?;
3593
- module.define_module_function("_config_get_field_native", function!(config_get_field_wrapper, 2))?;
3594
- module.define_module_function("_config_merge_native", function!(config_merge_wrapper, 2))?;
3595
- module.define_module_function("_result_page_count_native", function!(result_page_count, 1))?;
3596
- module.define_module_function("_result_chunk_count_native", function!(result_chunk_count, 1))?;
3597
- module.define_module_function(
3598
- "_result_detected_language_native",
3599
- function!(result_detected_language, 1),
3600
- )?;
3601
- module.define_module_function("_result_metadata_field_native", function!(result_metadata_field, 2))?;
3602
-
3603
- module.define_module_function("_get_error_details_native", function!(get_error_details_native, 0))?;
3604
- module.define_module_function("_classify_error_native", function!(classify_error_native, 1))?;
3605
- module.define_module_function("_error_code_name_native", function!(error_code_name_native, 1))?;
3606
- module.define_module_function(
3607
- "_error_code_description_native",
3608
- function!(error_code_description_native, 1),
3609
- )?;
2745
+ module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
2746
+ module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
3610
2747
 
3611
2748
  Ok(())
3612
2749
  }