kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (370):
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +4 -104
  8. data/README.md +454 -432
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -182
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -46
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -32
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -103
  41. data/lib/pdfium.dll +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -537
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +45 -0
  60. data/vendor/kreuzberg/Cargo.toml +61 -38
  61. data/vendor/kreuzberg/README.md +230 -221
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -891
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -432
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -569
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -417
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -161
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  305. data/vendor/kreuzberg-ffi/README.md +851 -0
  306. data/vendor/kreuzberg-ffi/build.rs +176 -0
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  309. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  310. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  311. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  312. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  313. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  315. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  316. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  317. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  318. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  319. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  320. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  321. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  322. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  323. data/vendor/kreuzberg-tesseract/README.md +399 -0
  324. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  325. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  326. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  327. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  328. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  329. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  330. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  331. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  332. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  333. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  334. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  335. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  336. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  337. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  338. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  339. data/vendor/rb-sys/Cargo.lock +393 -393
  340. data/vendor/rb-sys/Cargo.toml +70 -70
  341. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  342. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  343. data/vendor/rb-sys/LICENSE-MIT +21 -21
  344. data/vendor/rb-sys/build/features.rs +111 -111
  345. data/vendor/rb-sys/build/main.rs +286 -286
  346. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  347. data/vendor/rb-sys/build/version.rs +50 -50
  348. data/vendor/rb-sys/readme.md +36 -36
  349. data/vendor/rb-sys/src/bindings.rs +21 -21
  350. data/vendor/rb-sys/src/hidden.rs +11 -11
  351. data/vendor/rb-sys/src/lib.rs +35 -35
  352. data/vendor/rb-sys/src/macros.rs +371 -371
  353. data/vendor/rb-sys/src/memory.rs +53 -53
  354. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  355. data/vendor/rb-sys/src/special_consts.rs +31 -31
  356. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  357. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  358. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  359. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  360. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  361. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  362. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  364. data/vendor/rb-sys/src/stable_api.rs +260 -260
  365. data/vendor/rb-sys/src/symbol.rs +31 -31
  366. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  367. data/vendor/rb-sys/src/utils.rs +89 -89
  368. data/vendor/rb-sys/src/value_type.rs +7 -7
  369. metadata +44 -81
  370. data/vendor/rb-sys/bin/release.sh +0 -21
@@ -0,0 +1,3616 @@
1
+ //! C FFI bindings for Kreuzberg document intelligence library.
2
+ //!
3
+ //! Provides a C-compatible API that can be consumed by Java (Panama FFI),
4
+ //! Go (cgo), C# (P/Invoke), Zig, and other languages with C FFI support.
5
+
6
+ mod panic_shield;
7
+
8
+ pub use panic_shield::{
9
+ ErrorCode, StructuredError, clear_structured_error, get_last_error_code, get_last_error_message,
10
+ get_last_panic_context, set_structured_error,
11
+ };
12
+
13
+ use std::cell::RefCell;
14
+ use std::ffi::{CStr, CString};
15
+ use std::os::raw::c_char;
16
+ use std::path::Path;
17
+ use std::ptr;
18
+ use std::sync::Arc;
19
+
20
+ use async_trait::async_trait;
21
+ use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
22
+ use kreuzberg::plugins::registry::get_ocr_backend_registry;
23
+ use kreuzberg::plugins::{OcrBackend, Plugin, ProcessingStage};
24
+ use kreuzberg::types::ExtractionResult;
25
+ use kreuzberg::{KreuzbergError, Result};
26
+ #[cfg(not(all(windows, target_env = "gnu")))]
27
+ use serde::Serialize;
28
+
29
+ thread_local! {
30
+ static LAST_ERROR_C_STRING: RefCell<Option<CString>> = const { RefCell::new(None) };
31
+ }
32
+
33
+ /// Set the last error message (convenience wrapper for backward compatibility)
34
+ fn set_last_error(err: String) {
35
+ if let Ok(c_str) = CString::new(err.clone()) {
36
+ LAST_ERROR_C_STRING.with(|last| *last.borrow_mut() = Some(c_str));
37
+ }
38
+
39
+ let structured_err = StructuredError::from_message(err, ErrorCode::GenericError);
40
+ set_structured_error(structured_err);
41
+ }
42
+
43
+ /// Clear the last error message
44
+ fn clear_last_error() {
45
+ LAST_ERROR_C_STRING.with(|last| *last.borrow_mut() = None);
46
+ clear_structured_error();
47
+ }
48
+
49
/// Convert an owned Rust string into a heap-allocated, NUL-terminated C string.
///
/// On success the returned raw pointer comes from `CString::into_raw`, so ownership
/// is transferred to the caller.
///
/// # Errors
///
/// Returns a descriptive message when `value` contains an interior NUL byte.
fn string_to_c_string(value: String) -> std::result::Result<*mut c_char, String> {
    match CString::new(value) {
        Ok(owned) => Ok(owned.into_raw()),
        Err(nul_err) => Err(format!("Failed to create C string: {}", nul_err)),
    }
}
54
+
55
+ type FfiResult<T> = std::result::Result<T, String>;
56
+
57
+ fn parse_extraction_config_from_json(config_str: &str) -> FfiResult<ExtractionConfig> {
58
+ use html_to_markdown_rs::options::{
59
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
60
+ PreprocessingPreset, WhitespaceMode,
61
+ };
62
+
63
+ fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
64
+ where
65
+ F: Fn(&str) -> std::result::Result<T, String>,
66
+ {
67
+ if let Some(raw) = value {
68
+ let text = raw
69
+ .as_str()
70
+ .ok_or_else(|| "Expected string for html_options enum field".to_string())?;
71
+ return parse_fn(text).map(Some);
72
+ }
73
+ Ok(None)
74
+ }
75
+
76
+ fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
77
+ match value.to_lowercase().as_str() {
78
+ "atx" => Ok(HeadingStyle::Atx),
79
+ "underlined" => Ok(HeadingStyle::Underlined),
80
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
81
+ other => Err(format!(
82
+ "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
83
+ other
84
+ )),
85
+ }
86
+ }
87
+
88
+ fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
89
+ match value.to_lowercase().as_str() {
90
+ "spaces" => Ok(ListIndentType::Spaces),
91
+ "tabs" => Ok(ListIndentType::Tabs),
92
+ other => Err(format!(
93
+ "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
94
+ other
95
+ )),
96
+ }
97
+ }
98
+
99
+ fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
100
+ match value.to_lowercase().as_str() {
101
+ "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
102
+ "html" => Ok(HighlightStyle::Html),
103
+ "bold" => Ok(HighlightStyle::Bold),
104
+ "none" => Ok(HighlightStyle::None),
105
+ other => Err(format!(
106
+ "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
107
+ other
108
+ )),
109
+ }
110
+ }
111
+
112
+ fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
113
+ match value.to_lowercase().as_str() {
114
+ "normalized" => Ok(WhitespaceMode::Normalized),
115
+ "strict" => Ok(WhitespaceMode::Strict),
116
+ other => Err(format!(
117
+ "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
118
+ other
119
+ )),
120
+ }
121
+ }
122
+
123
+ fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
124
+ match value.to_lowercase().as_str() {
125
+ "spaces" => Ok(NewlineStyle::Spaces),
126
+ "backslash" => Ok(NewlineStyle::Backslash),
127
+ other => Err(format!(
128
+ "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
129
+ other
130
+ )),
131
+ }
132
+ }
133
+
134
+ fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
135
+ match value.to_lowercase().as_str() {
136
+ "indented" => Ok(CodeBlockStyle::Indented),
137
+ "backticks" => Ok(CodeBlockStyle::Backticks),
138
+ "tildes" => Ok(CodeBlockStyle::Tildes),
139
+ other => Err(format!(
140
+ "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
141
+ other
142
+ )),
143
+ }
144
+ }
145
+
146
+ fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
147
+ match value.to_lowercase().as_str() {
148
+ "minimal" => Ok(PreprocessingPreset::Minimal),
149
+ "standard" => Ok(PreprocessingPreset::Standard),
150
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
151
+ other => Err(format!(
152
+ "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
153
+ other
154
+ )),
155
+ }
156
+ }
157
+
158
+ fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
159
+ let mut opts = ConversionOptions::default();
160
+ let obj = value
161
+ .as_object()
162
+ .ok_or_else(|| "html_options must be an object".to_string())?;
163
+
164
+ if let Some(val) = obj.get("heading_style") {
165
+ opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
166
+ }
167
+
168
+ if let Some(val) = obj.get("list_indent_type") {
169
+ opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
170
+ }
171
+
172
+ if let Some(val) = obj.get("list_indent_width") {
173
+ opts.list_indent_width = val
174
+ .as_u64()
175
+ .map(|v| v as usize)
176
+ .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
177
+ }
178
+
179
+ if let Some(val) = obj.get("bullets") {
180
+ opts.bullets = val
181
+ .as_str()
182
+ .map(str::to_string)
183
+ .ok_or_else(|| "bullets must be a string".to_string())?;
184
+ }
185
+
186
+ if let Some(val) = obj.get("strong_em_symbol") {
187
+ let symbol = val
188
+ .as_str()
189
+ .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
190
+ let mut chars = symbol.chars();
191
+ opts.strong_em_symbol = chars
192
+ .next()
193
+ .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
194
+ }
195
+
196
+ if let Some(val) = obj.get("escape_asterisks") {
197
+ opts.escape_asterisks = val
198
+ .as_bool()
199
+ .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
200
+ }
201
+ if let Some(val) = obj.get("escape_underscores") {
202
+ opts.escape_underscores = val
203
+ .as_bool()
204
+ .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
205
+ }
206
+ if let Some(val) = obj.get("escape_misc") {
207
+ opts.escape_misc = val
208
+ .as_bool()
209
+ .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
210
+ }
211
+ if let Some(val) = obj.get("escape_ascii") {
212
+ opts.escape_ascii = val
213
+ .as_bool()
214
+ .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
215
+ }
216
+
217
+ if let Some(val) = obj.get("code_language") {
218
+ opts.code_language = val
219
+ .as_str()
220
+ .map(str::to_string)
221
+ .ok_or_else(|| "code_language must be a string".to_string())?;
222
+ }
223
+
224
+ if let Some(val) = obj.get("autolinks") {
225
+ opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
226
+ }
227
+
228
+ if let Some(val) = obj.get("default_title") {
229
+ opts.default_title = val
230
+ .as_bool()
231
+ .ok_or_else(|| "default_title must be a boolean".to_string())?;
232
+ }
233
+
234
+ if let Some(val) = obj.get("br_in_tables") {
235
+ opts.br_in_tables = val
236
+ .as_bool()
237
+ .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
238
+ }
239
+
240
+ if let Some(val) = obj.get("hocr_spatial_tables") {
241
+ opts.hocr_spatial_tables = val
242
+ .as_bool()
243
+ .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
244
+ }
245
+
246
+ if let Some(val) = obj.get("highlight_style") {
247
+ opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
248
+ }
249
+
250
+ if let Some(val) = obj.get("extract_metadata") {
251
+ opts.extract_metadata = val
252
+ .as_bool()
253
+ .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
254
+ }
255
+
256
+ if let Some(val) = obj.get("whitespace_mode") {
257
+ opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
258
+ }
259
+
260
+ if let Some(val) = obj.get("strip_newlines") {
261
+ opts.strip_newlines = val
262
+ .as_bool()
263
+ .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
264
+ }
265
+
266
+ if let Some(val) = obj.get("wrap") {
267
+ opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
268
+ }
269
+
270
+ if let Some(val) = obj.get("wrap_width") {
271
+ opts.wrap_width = val
272
+ .as_u64()
273
+ .map(|v| v as usize)
274
+ .ok_or_else(|| "wrap_width must be an integer".to_string())?;
275
+ }
276
+
277
+ if let Some(val) = obj.get("convert_as_inline") {
278
+ opts.convert_as_inline = val
279
+ .as_bool()
280
+ .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
281
+ }
282
+
283
+ if let Some(val) = obj.get("sub_symbol") {
284
+ opts.sub_symbol = val
285
+ .as_str()
286
+ .map(str::to_string)
287
+ .ok_or_else(|| "sub_symbol must be a string".to_string())?;
288
+ }
289
+
290
+ if let Some(val) = obj.get("sup_symbol") {
291
+ opts.sup_symbol = val
292
+ .as_str()
293
+ .map(str::to_string)
294
+ .ok_or_else(|| "sup_symbol must be a string".to_string())?;
295
+ }
296
+
297
+ if let Some(val) = obj.get("newline_style") {
298
+ opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
299
+ }
300
+
301
+ if let Some(val) = obj.get("code_block_style") {
302
+ opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
303
+ }
304
+
305
+ if let Some(val) = obj.get("keep_inline_images_in") {
306
+ opts.keep_inline_images_in = val
307
+ .as_array()
308
+ .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
309
+ .iter()
310
+ .map(|v| {
311
+ v.as_str()
312
+ .map(str::to_string)
313
+ .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
314
+ })
315
+ .collect::<std::result::Result<Vec<_>, _>>()?;
316
+ }
317
+
318
+ if let Some(val) = obj.get("encoding") {
319
+ opts.encoding = val
320
+ .as_str()
321
+ .map(str::to_string)
322
+ .ok_or_else(|| "encoding must be a string".to_string())?;
323
+ }
324
+
325
+ if let Some(val) = obj.get("debug") {
326
+ opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
327
+ }
328
+
329
+ if let Some(val) = obj.get("strip_tags") {
330
+ opts.strip_tags = val
331
+ .as_array()
332
+ .ok_or_else(|| "strip_tags must be an array".to_string())?
333
+ .iter()
334
+ .map(|v| {
335
+ v.as_str()
336
+ .map(str::to_string)
337
+ .ok_or_else(|| "strip_tags entries must be strings".to_string())
338
+ })
339
+ .collect::<std::result::Result<Vec<_>, _>>()?;
340
+ }
341
+
342
+ if let Some(val) = obj.get("preserve_tags") {
343
+ opts.preserve_tags = val
344
+ .as_array()
345
+ .ok_or_else(|| "preserve_tags must be an array".to_string())?
346
+ .iter()
347
+ .map(|v| {
348
+ v.as_str()
349
+ .map(str::to_string)
350
+ .ok_or_else(|| "preserve_tags entries must be strings".to_string())
351
+ })
352
+ .collect::<std::result::Result<Vec<_>, _>>()?;
353
+ }
354
+
355
+ if let Some(val) = obj.get("preprocessing") {
356
+ let pre = val
357
+ .as_object()
358
+ .ok_or_else(|| "preprocessing must be an object".to_string())?;
359
+ let mut preprocessing = opts.preprocessing.clone();
360
+
361
+ if let Some(v) = pre.get("enabled") {
362
+ preprocessing.enabled = v
363
+ .as_bool()
364
+ .ok_or_else(|| "preprocessing.enabled must be a boolean".to_string())?;
365
+ }
366
+
367
+ if let Some(v) = pre.get("preset") {
368
+ let preset = v
369
+ .as_str()
370
+ .ok_or_else(|| "preprocessing.preset must be a string".to_string())?;
371
+ preprocessing.preset = parse_preprocessing_preset(preset)?;
372
+ }
373
+
374
+ if let Some(v) = pre.get("remove_navigation") {
375
+ preprocessing.remove_navigation = v
376
+ .as_bool()
377
+ .ok_or_else(|| "preprocessing.remove_navigation must be a boolean".to_string())?;
378
+ }
379
+
380
+ if let Some(v) = pre.get("remove_forms") {
381
+ preprocessing.remove_forms = v
382
+ .as_bool()
383
+ .ok_or_else(|| "preprocessing.remove_forms must be a boolean".to_string())?;
384
+ }
385
+
386
+ opts.preprocessing = preprocessing;
387
+ }
388
+
389
+ Ok(opts)
390
+ }
391
+
392
+ let value: serde_json::Value =
393
+ serde_json::from_str(config_str).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
394
+
395
+ let html_options = value.get("html_options").map(parse_html_options).transpose()?;
396
+
397
+ let mut config: ExtractionConfig =
398
+ serde_json::from_value(value).map_err(|e| format!("Failed to parse config JSON: {}", e))?;
399
+
400
+ if let Some(options) = html_options {
401
+ config.html_options = Some(options);
402
+ }
403
+
404
+ Ok(config)
405
+ }
406
+
407
/// Owns a raw C string pointer and frees it on drop unless ownership has been released.
///
/// Used while assembling a `CExtractionResult`: if a later conversion fails and the
/// function returns early, every guard still alive reclaims its string, so nothing leaks.
struct CStringGuard {
    /// Pointer produced by `CString::into_raw`, or null once ownership was handed off.
    ptr: *mut c_char,
}

impl CStringGuard {
    /// Wraps `s`, taking over its allocation as a raw pointer.
    fn new(s: CString) -> Self {
        let ptr = s.into_raw();
        Self { ptr }
    }

    /// Hands the raw pointer to the caller and disarms the guard.
    ///
    /// After this call the caller is responsible for releasing the string.
    fn into_raw(mut self) -> *mut c_char {
        // Swapping in null turns the `Drop` that runs at the end of this call into a no-op.
        std::mem::replace(&mut self.ptr, ptr::null_mut())
    }
}

impl Drop for CStringGuard {
    fn drop(&mut self) {
        if self.ptr.is_null() {
            return;
        }
        // SAFETY: a non-null `ptr` always comes from `CString::into_raw` in `new`
        // and has not been given away through `into_raw`.
        unsafe { drop(CString::from_raw(self.ptr)) };
    }
}
437
+
438
/// Extraction result in a C-compatible layout.
///
/// The layout must stay in sync with the Java side's MemoryLayout definition in
/// KreuzbergFFI.java: 11 pointers (8 bytes each) + 1 bool + 7 bytes padding = 96 bytes.
///
/// Every non-NULL string field is a null-terminated string that must be freed with
/// `kreuzberg_free_string`.
#[repr(C)]
pub struct CExtractionResult {
    /// Extracted text content (UTF-8).
    pub content: *mut c_char,
    /// Detected MIME type.
    pub mime_type: *mut c_char,
    /// Document language, or NULL if not available.
    pub language: *mut c_char,
    /// Document date, or NULL if not available.
    pub date: *mut c_char,
    /// Document subject, or NULL if not available.
    pub subject: *mut c_char,
    /// Tables serialized as a JSON array, or NULL if there are no tables.
    pub tables_json: *mut c_char,
    /// Detected languages serialized as a JSON array, or NULL if not available.
    pub detected_languages_json: *mut c_char,
    /// Metadata serialized as a JSON object, or NULL if there is no metadata.
    pub metadata_json: *mut c_char,
    /// Text chunks serialized as a JSON array, or NULL if not available.
    pub chunks_json: *mut c_char,
    /// Extracted images serialized as a JSON array, or NULL if not available.
    pub images_json: *mut c_char,
    /// Page structure serialized as a JSON object, or NULL if not available.
    pub page_structure_json: *mut c_char,
    /// Whether extraction was successful.
    pub success: bool,
    /// Explicit tail padding (7 bytes) so the struct matches the Java MemoryLayout.
    _padding1: [u8; 7],
}
471
+
472
+ /// Helper function to convert ExtractionResult to CExtractionResult
473
+ ///
474
+ /// Uses RAII guards to prevent memory leaks if any string allocation fails.
475
+ /// All allocated C strings are automatically freed if an error occurs before
476
+ /// the final result is constructed.
477
+ fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*mut CExtractionResult, String> {
478
+ let ExtractionResult {
479
+ content,
480
+ mime_type,
481
+ metadata,
482
+ tables,
483
+ detected_languages,
484
+ chunks,
485
+ images,
486
+ pages,
487
+ } = result;
488
+
489
+ let content_guard =
490
+ CStringGuard::new(CString::new(content).map_err(|e| format!("Failed to convert content to C string: {}", e))?);
491
+
492
+ let mime_type_guard = CStringGuard::new(
493
+ CString::new(mime_type).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
494
+ );
495
+
496
+ let language_guard = match &metadata.language {
497
+ Some(lang) => Some(CStringGuard::new(
498
+ CString::new(lang.as_str()).map_err(|e| format!("Failed to convert language to C string: {}", e))?,
499
+ )),
500
+ None => None,
501
+ };
502
+
503
+ let date_guard = match &metadata.date {
504
+ Some(d) => Some(CStringGuard::new(
505
+ CString::new(d.as_str()).map_err(|e| format!("Failed to convert date to C string: {}", e))?,
506
+ )),
507
+ None => None,
508
+ };
509
+
510
+ let subject_guard = match &metadata.subject {
511
+ Some(subj) => Some(CStringGuard::new(
512
+ CString::new(subj.as_str()).map_err(|e| format!("Failed to convert subject to C string: {}", e))?,
513
+ )),
514
+ None => None,
515
+ };
516
+
517
+ let tables_json_guard = if !tables.is_empty() {
518
+ let json = serde_json::to_string(&tables).map_err(|e| format!("Failed to serialize tables to JSON: {}", e))?;
519
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
520
+ format!("Failed to convert tables JSON to C string: {}", e)
521
+ })?))
522
+ } else {
523
+ None
524
+ };
525
+
526
+ let detected_languages_json_guard = match detected_languages {
527
+ Some(langs) if !langs.is_empty() => {
528
+ let json = serde_json::to_string(&langs)
529
+ .map_err(|e| format!("Failed to serialize detected languages to JSON: {}", e))?;
530
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
531
+ format!("Failed to convert detected languages JSON to C string: {}", e)
532
+ })?))
533
+ }
534
+ _ => None,
535
+ };
536
+
537
+ let metadata_json_guard = {
538
+ let json =
539
+ serde_json::to_string(&metadata).map_err(|e| format!("Failed to serialize metadata to JSON: {}", e))?;
540
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
541
+ format!("Failed to convert metadata JSON to C string: {}", e)
542
+ })?))
543
+ };
544
+
545
+ let chunks_json_guard = match chunks {
546
+ Some(chunks) if !chunks.is_empty() => {
547
+ let json =
548
+ serde_json::to_string(&chunks).map_err(|e| format!("Failed to serialize chunks to JSON: {}", e))?;
549
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
550
+ format!("Failed to convert chunks JSON to C string: {}", e)
551
+ })?))
552
+ }
553
+ _ => None,
554
+ };
555
+
556
+ let images_json_guard = match images {
557
+ Some(images) if !images.is_empty() => {
558
+ let json =
559
+ serde_json::to_string(&images).map_err(|e| format!("Failed to serialize images to JSON: {}", e))?;
560
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
561
+ format!("Failed to convert images JSON to C string: {}", e)
562
+ })?))
563
+ }
564
+ _ => None,
565
+ };
566
+
567
+ let page_structure_json_guard = match &metadata.pages {
568
+ Some(page_structure) => {
569
+ let json = serde_json::to_string(&page_structure)
570
+ .map_err(|e| format!("Failed to serialize page structure to JSON: {}", e))?;
571
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
572
+ format!("Failed to convert page structure JSON to C string: {}", e)
573
+ })?))
574
+ }
575
+ _ => None,
576
+ };
577
+
578
+ let _pages_json_guard = match pages {
579
+ Some(pages) if !pages.is_empty() => {
580
+ let json =
581
+ serde_json::to_string(&pages).map_err(|e| format!("Failed to serialize pages to JSON: {}", e))?;
582
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
583
+ format!("Failed to convert pages JSON to C string: {}", e)
584
+ })?))
585
+ }
586
+ _ => None,
587
+ };
588
+
589
+ Ok(Box::into_raw(Box::new(CExtractionResult {
590
+ content: content_guard.into_raw(),
591
+ mime_type: mime_type_guard.into_raw(),
592
+ language: language_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
593
+ date: date_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
594
+ subject: subject_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
595
+ tables_json: tables_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
596
+ detected_languages_json: detected_languages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
597
+ metadata_json: metadata_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
598
+ chunks_json: chunks_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
599
+ images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
600
+ page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
601
+ success: true,
602
+ _padding1: [0u8; 7],
603
+ })))
604
+ }
605
+
606
+ /// Extract text and metadata from a file (synchronous).
607
+ ///
608
+ /// # Safety
609
+ ///
610
+ /// - `file_path` must be a valid null-terminated C string
611
+ /// - The returned pointer must be freed with `kreuzberg_free_result`
612
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
613
+ ///
614
+ /// # Example (C)
615
+ ///
616
+ /// ```c
617
+ /// const char* path = "/path/to/document.pdf";
618
+ /// CExtractionResult* result = kreuzberg_extract_file_sync(path);
619
+ /// if (result != NULL && result->success) {
620
+ /// printf("Content: %s\n", result->content);
621
+ /// printf("MIME: %s\n", result->mime_type);
622
+ /// kreuzberg_free_result(result);
623
+ /// } else {
624
+ /// const char* error = kreuzberg_last_error();
625
+ /// printf("Error: %s\n", error);
626
+ /// }
627
+ /// ```
628
+ #[unsafe(no_mangle)]
629
+ pub unsafe extern "C" fn kreuzberg_extract_file_sync(file_path: *const c_char) -> *mut CExtractionResult {
630
+ ffi_panic_guard!("kreuzberg_extract_file_sync", {
631
+ clear_last_error();
632
+
633
+ if file_path.is_null() {
634
+ set_last_error("file_path cannot be NULL".to_string());
635
+ return ptr::null_mut();
636
+ }
637
+
638
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
639
+ Ok(s) => s,
640
+ Err(e) => {
641
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
642
+ return ptr::null_mut();
643
+ }
644
+ };
645
+
646
+ let path = Path::new(path_str);
647
+ let config = ExtractionConfig::default();
648
+
649
+ match kreuzberg::extract_file_sync(path, None, &config) {
650
+ Ok(result) => match to_c_extraction_result(result) {
651
+ Ok(ptr) => ptr,
652
+ Err(e) => {
653
+ set_last_error(e);
654
+ ptr::null_mut()
655
+ }
656
+ },
657
+ Err(e) => {
658
+ set_last_error(e.to_string());
659
+ ptr::null_mut()
660
+ }
661
+ }
662
+ })
663
+ }
664
+
665
+ /// Detect MIME type from a file path.
666
+ ///
667
+ /// # Safety
668
+ ///
669
+ /// - `file_path` must be a valid null-terminated C string
670
+ /// - The returned string must be freed with `kreuzberg_free_string`
671
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
672
+ #[unsafe(no_mangle)]
673
+ pub unsafe extern "C" fn kreuzberg_detect_mime_type(file_path: *const c_char, check_exists: bool) -> *mut c_char {
674
+ ffi_panic_guard!("kreuzberg_detect_mime_type", {
675
+ clear_last_error();
676
+
677
+ if file_path.is_null() {
678
+ set_last_error("file_path cannot be NULL".to_string());
679
+ return ptr::null_mut();
680
+ }
681
+
682
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
683
+ Ok(s) => s,
684
+ Err(e) => {
685
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
686
+ return ptr::null_mut();
687
+ }
688
+ };
689
+
690
+ match kreuzberg::core::mime::detect_mime_type(path_str, check_exists) {
691
+ Ok(mime) => match string_to_c_string(mime) {
692
+ Ok(ptr) => ptr,
693
+ Err(e) => {
694
+ set_last_error(e);
695
+ ptr::null_mut()
696
+ }
697
+ },
698
+ Err(e) => {
699
+ set_last_error(e.to_string());
700
+ ptr::null_mut()
701
+ }
702
+ }
703
+ })
704
+ }
705
+
706
+ /// Validate that a MIME type is supported by Kreuzberg.
707
+ ///
708
+ /// # Safety
709
+ ///
710
+ /// - `mime_type` must be a valid null-terminated C string
711
+ /// - The returned string must be freed with `kreuzberg_free_string`
712
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
713
+ #[unsafe(no_mangle)]
714
+ pub unsafe extern "C" fn kreuzberg_validate_mime_type(mime_type: *const c_char) -> *mut c_char {
715
+ ffi_panic_guard!("kreuzberg_validate_mime_type", {
716
+ clear_last_error();
717
+
718
+ if mime_type.is_null() {
719
+ set_last_error("mime_type cannot be NULL".to_string());
720
+ return ptr::null_mut();
721
+ }
722
+
723
+ let mime_type_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
724
+ Ok(s) => s,
725
+ Err(e) => {
726
+ set_last_error(format!("Invalid UTF-8 in mime_type: {}", e));
727
+ return ptr::null_mut();
728
+ }
729
+ };
730
+
731
+ match kreuzberg::validate_mime_type(mime_type_str) {
732
+ Ok(validated) => match string_to_c_string(validated) {
733
+ Ok(ptr) => ptr,
734
+ Err(e) => {
735
+ set_last_error(e);
736
+ ptr::null_mut()
737
+ }
738
+ },
739
+ Err(e) => {
740
+ set_last_error(e.to_string());
741
+ ptr::null_mut()
742
+ }
743
+ }
744
+ })
745
+ }
746
+
747
+ #[derive(Serialize)]
748
+ #[cfg(not(all(windows, target_env = "gnu")))]
749
+ struct SerializableEmbeddingPreset<'a> {
750
+ name: &'a str,
751
+ chunk_size: usize,
752
+ overlap: usize,
753
+ model_name: String,
754
+ dimensions: usize,
755
+ description: &'a str,
756
+ }
757
+
758
+ /// List available embedding preset names.
759
+ ///
760
+ /// # Safety
761
+ ///
762
+ /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
763
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
764
+ #[cfg(not(all(windows, target_env = "gnu")))]
765
+ #[unsafe(no_mangle)]
766
+ pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
767
+ ffi_panic_guard!("kreuzberg_list_embedding_presets", {
768
+ clear_last_error();
769
+
770
+ let presets = kreuzberg::embeddings::list_presets();
771
+ match serde_json::to_string(&presets) {
772
+ Ok(json) => match string_to_c_string(json) {
773
+ Ok(ptr) => ptr,
774
+ Err(e) => {
775
+ set_last_error(e);
776
+ ptr::null_mut()
777
+ }
778
+ },
779
+ Err(e) => {
780
+ set_last_error(format!("Failed to serialize presets: {}", e));
781
+ ptr::null_mut()
782
+ }
783
+ }
784
+ })
785
+ }
786
+
787
+ /// Get a specific embedding preset by name.
788
+ ///
789
+ /// # Safety
790
+ ///
791
+ /// - `name` must be a valid null-terminated C string
792
+ /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
793
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
794
+ #[cfg(not(all(windows, target_env = "gnu")))]
795
+ #[unsafe(no_mangle)]
796
+ pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
797
+ ffi_panic_guard!("kreuzberg_get_embedding_preset", {
798
+ clear_last_error();
799
+
800
+ if name.is_null() {
801
+ set_last_error("preset name cannot be NULL".to_string());
802
+ return ptr::null_mut();
803
+ }
804
+
805
+ let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
806
+ Ok(s) => s,
807
+ Err(e) => {
808
+ set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
809
+ return ptr::null_mut();
810
+ }
811
+ };
812
+
813
+ let preset = match kreuzberg::embeddings::get_preset(preset_name) {
814
+ Some(preset) => preset,
815
+ None => {
816
+ set_last_error(format!("Unknown embedding preset: {}", preset_name));
817
+ return ptr::null_mut();
818
+ }
819
+ };
820
+
821
+ let model_name = format!("{:?}", preset.model);
822
+ let serializable = SerializableEmbeddingPreset {
823
+ name: preset.name,
824
+ chunk_size: preset.chunk_size,
825
+ overlap: preset.overlap,
826
+ model_name,
827
+ dimensions: preset.dimensions,
828
+ description: preset.description,
829
+ };
830
+
831
+ match serde_json::to_string(&serializable) {
832
+ Ok(json) => match string_to_c_string(json) {
833
+ Ok(ptr) => ptr,
834
+ Err(e) => {
835
+ set_last_error(e);
836
+ ptr::null_mut()
837
+ }
838
+ },
839
+ Err(e) => {
840
+ set_last_error(format!("Failed to serialize embedding preset: {}", e));
841
+ ptr::null_mut()
842
+ }
843
+ }
844
+ })
845
+ }
846
+
847
+ /// Extract text and metadata from a file with custom configuration (synchronous).
848
+ ///
849
+ /// # Safety
850
+ ///
851
+ /// - `file_path` must be a valid null-terminated C string
852
+ /// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
853
+ /// - The returned pointer must be freed with `kreuzberg_free_result`
854
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
855
+ ///
856
+ /// # Example (C)
857
+ ///
858
+ /// ```c
859
+ /// const char* path = "/path/to/document.pdf";
860
+ /// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
861
+ /// CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
862
+ /// if (result != NULL && result->success) {
863
+ /// printf("Content: %s\n", result->content);
864
+ /// kreuzberg_free_result(result);
865
+ /// }
866
+ /// ```
867
+ #[unsafe(no_mangle)]
868
+ pub unsafe extern "C" fn kreuzberg_extract_file_sync_with_config(
869
+ file_path: *const c_char,
870
+ config_json: *const c_char,
871
+ ) -> *mut CExtractionResult {
872
+ ffi_panic_guard!("kreuzberg_extract_file_sync_with_config", {
873
+ clear_last_error();
874
+
875
+ if file_path.is_null() {
876
+ set_last_error("file_path cannot be NULL".to_string());
877
+ return ptr::null_mut();
878
+ }
879
+
880
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
881
+ Ok(s) => s,
882
+ Err(e) => {
883
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
884
+ return ptr::null_mut();
885
+ }
886
+ };
887
+
888
+ let path = Path::new(path_str);
889
+
890
+ let config = if config_json.is_null() {
891
+ ExtractionConfig::default()
892
+ } else {
893
+ let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
894
+ Ok(s) => s,
895
+ Err(e) => {
896
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
897
+ return ptr::null_mut();
898
+ }
899
+ };
900
+
901
+ match parse_extraction_config_from_json(config_str) {
902
+ Ok(cfg) => cfg,
903
+ Err(e) => {
904
+ set_last_error(e);
905
+ return ptr::null_mut();
906
+ }
907
+ }
908
+ };
909
+
910
+ match kreuzberg::extract_file_sync(path, None, &config) {
911
+ Ok(result) => match to_c_extraction_result(result) {
912
+ Ok(ptr) => ptr,
913
+ Err(e) => {
914
+ set_last_error(e);
915
+ ptr::null_mut()
916
+ }
917
+ },
918
+ Err(e) => {
919
+ set_last_error(e.to_string());
920
+ ptr::null_mut()
921
+ }
922
+ }
923
+ })
924
+ }
925
+
926
+ /// Extract text and metadata from byte array (synchronous).
927
+ ///
928
+ /// # Safety
929
+ ///
930
+ /// - `data` must be a valid pointer to a byte array of length `data_len`
931
+ /// - `mime_type` must be a valid null-terminated C string
932
+ /// - The returned pointer must be freed with `kreuzberg_free_result`
933
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
934
+ ///
935
+ /// # Example (C)
936
+ ///
937
+ /// ```c
938
+ /// const uint8_t* data = ...; // Document bytes
939
+ /// size_t len = ...; // Length of data
940
+ /// const char* mime = "application/pdf";
941
+ /// CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
942
+ /// if (result != NULL && result->success) {
943
+ /// printf("Content: %s\n", result->content);
944
+ /// kreuzberg_free_result(result);
945
+ /// } else {
946
+ /// const char* error = kreuzberg_last_error();
947
+ /// printf("Error: %s\n", error);
948
+ /// }
949
+ /// ```
950
+ #[unsafe(no_mangle)]
951
+ pub unsafe extern "C" fn kreuzberg_extract_bytes_sync(
952
+ data: *const u8,
953
+ data_len: usize,
954
+ mime_type: *const c_char,
955
+ ) -> *mut CExtractionResult {
956
+ ffi_panic_guard!("kreuzberg_extract_bytes_sync", {
957
+ clear_last_error();
958
+
959
+ if data.is_null() {
960
+ set_last_error("data cannot be NULL".to_string());
961
+ return ptr::null_mut();
962
+ }
963
+
964
+ if mime_type.is_null() {
965
+ set_last_error("mime_type cannot be NULL".to_string());
966
+ return ptr::null_mut();
967
+ }
968
+
969
+ let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
970
+
971
+ let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
972
+ Ok(s) => s,
973
+ Err(e) => {
974
+ set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
975
+ return ptr::null_mut();
976
+ }
977
+ };
978
+
979
+ let config = ExtractionConfig::default();
980
+
981
+ match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
982
+ Ok(result) => match to_c_extraction_result(result) {
983
+ Ok(ptr) => ptr,
984
+ Err(e) => {
985
+ set_last_error(e);
986
+ ptr::null_mut()
987
+ }
988
+ },
989
+ Err(e) => {
990
+ set_last_error(e.to_string());
991
+ ptr::null_mut()
992
+ }
993
+ }
994
+ })
995
+ }
996
+
997
+ /// Extract text and metadata from byte array with custom configuration (synchronous).
998
+ ///
999
+ /// # Safety
1000
+ ///
1001
+ /// - `data` must be a valid pointer to a byte array of length `data_len`
1002
+ /// - `mime_type` must be a valid null-terminated C string
1003
+ /// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1004
+ /// - The returned pointer must be freed with `kreuzberg_free_result`
1005
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
1006
+ ///
1007
+ /// # Example (C)
1008
+ ///
1009
+ /// ```c
1010
+ /// const uint8_t* data = ...; // Document bytes
1011
+ /// size_t len = ...; // Length of data
1012
+ /// const char* mime = "application/pdf";
1013
+ /// const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
1014
+ /// CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
1015
+ /// if (result != NULL && result->success) {
1016
+ /// printf("Content: %s\n", result->content);
1017
+ /// kreuzberg_free_result(result);
1018
+ /// }
1019
+ /// ```
1020
+ #[unsafe(no_mangle)]
1021
+ pub unsafe extern "C" fn kreuzberg_extract_bytes_sync_with_config(
1022
+ data: *const u8,
1023
+ data_len: usize,
1024
+ mime_type: *const c_char,
1025
+ config_json: *const c_char,
1026
+ ) -> *mut CExtractionResult {
1027
+ ffi_panic_guard!("kreuzberg_extract_bytes_sync_with_config", {
1028
+ clear_last_error();
1029
+
1030
+ if data.is_null() {
1031
+ set_last_error("data cannot be NULL".to_string());
1032
+ return ptr::null_mut();
1033
+ }
1034
+
1035
+ if mime_type.is_null() {
1036
+ set_last_error("mime_type cannot be NULL".to_string());
1037
+ return ptr::null_mut();
1038
+ }
1039
+
1040
+ let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
1041
+
1042
+ let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
1043
+ Ok(s) => s,
1044
+ Err(e) => {
1045
+ set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
1046
+ return ptr::null_mut();
1047
+ }
1048
+ };
1049
+
1050
+ let config = if config_json.is_null() {
1051
+ ExtractionConfig::default()
1052
+ } else {
1053
+ let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
1054
+ Ok(s) => s,
1055
+ Err(e) => {
1056
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
1057
+ return ptr::null_mut();
1058
+ }
1059
+ };
1060
+
1061
+ match parse_extraction_config_from_json(config_str) {
1062
+ Ok(cfg) => cfg,
1063
+ Err(e) => {
1064
+ set_last_error(e);
1065
+ return ptr::null_mut();
1066
+ }
1067
+ }
1068
+ };
1069
+
1070
+ match kreuzberg::extract_bytes_sync(bytes, mime_str, &config) {
1071
+ Ok(result) => match to_c_extraction_result(result) {
1072
+ Ok(ptr) => ptr,
1073
+ Err(e) => {
1074
+ set_last_error(e);
1075
+ ptr::null_mut()
1076
+ }
1077
+ },
1078
+ Err(e) => {
1079
+ set_last_error(e.to_string());
1080
+ ptr::null_mut()
1081
+ }
1082
+ }
1083
+ })
1084
+ }
1085
+
1086
/// One batch input item in a C-compatible layout: a byte buffer plus its MIME type.
///
/// The layout must stay in sync with the Java side's MemoryLayout definition in
/// KreuzbergFFI.java: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes.
#[repr(C)]
pub struct CBytesWithMime {
    /// Start of the byte data.
    pub data: *const u8,
    /// Number of bytes available at `data`.
    pub data_len: usize,
    /// MIME type of the data as a null-terminated C string.
    pub mime_type: *const c_char,
}
1099
+
1100
+ /// C-compatible structure for batch extraction results
1101
+ ///
1102
+ /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
1103
+ /// Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
1104
+ #[repr(C)]
1105
+ pub struct CBatchResult {
1106
+ /// Array of extraction results
1107
+ pub results: *mut *mut CExtractionResult,
1108
+ /// Number of results
1109
+ pub count: usize,
1110
+ /// Whether batch operation was successful
1111
+ pub success: bool,
1112
+ /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
1113
+ _padding2: [u8; 7],
1114
+ }
1115
+
1116
+ /// Batch extract text and metadata from multiple files (synchronous).
1117
+ ///
1118
+ /// # Safety
1119
+ ///
1120
+ /// - `file_paths` must be a valid pointer to an array of null-terminated C strings
1121
+ /// - `count` must be the number of file paths in the array
1122
+ /// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1123
+ /// - The returned pointer must be freed with `kreuzberg_free_batch_result`
1124
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
1125
+ #[unsafe(no_mangle)]
1126
+ pub unsafe extern "C" fn kreuzberg_batch_extract_files_sync(
1127
+ file_paths: *const *const c_char,
1128
+ count: usize,
1129
+ config_json: *const c_char,
1130
+ ) -> *mut CBatchResult {
1131
+ ffi_panic_guard!("kreuzberg_batch_extract_files_sync", {
1132
+ clear_last_error();
1133
+
1134
+ if file_paths.is_null() {
1135
+ set_last_error("file_paths cannot be NULL".to_string());
1136
+ return ptr::null_mut();
1137
+ }
1138
+
1139
+ let config = if config_json.is_null() {
1140
+ ExtractionConfig::default()
1141
+ } else {
1142
+ let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
1143
+ Ok(s) => s,
1144
+ Err(e) => {
1145
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
1146
+ return ptr::null_mut();
1147
+ }
1148
+ };
1149
+
1150
+ match parse_extraction_config_from_json(config_str) {
1151
+ Ok(cfg) => cfg,
1152
+ Err(e) => {
1153
+ set_last_error(e);
1154
+ return ptr::null_mut();
1155
+ }
1156
+ }
1157
+ };
1158
+
1159
+ let mut paths = Vec::with_capacity(count);
1160
+ for i in 0..count {
1161
+ let path_ptr = unsafe { *file_paths.add(i) };
1162
+ if path_ptr.is_null() {
1163
+ set_last_error(format!("File path at index {} is NULL", i));
1164
+ return ptr::null_mut();
1165
+ }
1166
+
1167
+ let path_str = match unsafe { CStr::from_ptr(path_ptr) }.to_str() {
1168
+ Ok(s) => s,
1169
+ Err(e) => {
1170
+ set_last_error(format!("Invalid UTF-8 in file path at index {}: {}", i, e));
1171
+ return ptr::null_mut();
1172
+ }
1173
+ };
1174
+
1175
+ paths.push(Path::new(path_str));
1176
+ }
1177
+
1178
+ match kreuzberg::batch_extract_file_sync(paths, &config) {
1179
+ Ok(results) => {
1180
+ let mut c_results = Vec::with_capacity(results.len());
1181
+ for result in results {
1182
+ match to_c_extraction_result(result) {
1183
+ Ok(ptr) => c_results.push(ptr),
1184
+ Err(e) => {
1185
+ for c_res in c_results {
1186
+ unsafe { kreuzberg_free_result(c_res) };
1187
+ }
1188
+ set_last_error(e);
1189
+ return ptr::null_mut();
1190
+ }
1191
+ }
1192
+ }
1193
+
1194
+ let results_array = c_results.into_boxed_slice();
1195
+ let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
1196
+
1197
+ Box::into_raw(Box::new(CBatchResult {
1198
+ results: results_ptr,
1199
+ count,
1200
+ success: true,
1201
+ _padding2: [0u8; 7],
1202
+ }))
1203
+ }
1204
+ Err(e) => {
1205
+ set_last_error(e.to_string());
1206
+ ptr::null_mut()
1207
+ }
1208
+ }
1209
+ })
1210
+ }
1211
+
1212
+ /// Batch extract text and metadata from multiple byte arrays (synchronous).
1213
+ ///
1214
+ /// # Safety
1215
+ ///
1216
+ /// - `items` must be a valid pointer to an array of CBytesWithMime structures
1217
+ /// - `count` must be the number of items in the array
1218
+ /// - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1219
+ /// - The returned pointer must be freed with `kreuzberg_free_batch_result`
1220
+ /// - Returns NULL on error (check `kreuzberg_last_error` for details)
1221
+ #[unsafe(no_mangle)]
1222
+ pub unsafe extern "C" fn kreuzberg_batch_extract_bytes_sync(
1223
+ items: *const CBytesWithMime,
1224
+ count: usize,
1225
+ config_json: *const c_char,
1226
+ ) -> *mut CBatchResult {
1227
+ ffi_panic_guard!("kreuzberg_batch_extract_bytes_sync", {
1228
+ clear_last_error();
1229
+
1230
+ if items.is_null() {
1231
+ set_last_error("items cannot be NULL".to_string());
1232
+ return ptr::null_mut();
1233
+ }
1234
+
1235
+ let config = if config_json.is_null() {
1236
+ ExtractionConfig::default()
1237
+ } else {
1238
+ let config_str = match unsafe { CStr::from_ptr(config_json) }.to_str() {
1239
+ Ok(s) => s,
1240
+ Err(e) => {
1241
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
1242
+ return ptr::null_mut();
1243
+ }
1244
+ };
1245
+
1246
+ match parse_extraction_config_from_json(config_str) {
1247
+ Ok(cfg) => cfg,
1248
+ Err(e) => {
1249
+ set_last_error(e);
1250
+ return ptr::null_mut();
1251
+ }
1252
+ }
1253
+ };
1254
+
1255
+ let mut contents = Vec::with_capacity(count);
1256
+ for i in 0..count {
1257
+ let item = unsafe { &*items.add(i) };
1258
+
1259
+ if item.data.is_null() {
1260
+ set_last_error(format!("Data at index {} is NULL", i));
1261
+ return ptr::null_mut();
1262
+ }
1263
+
1264
+ if item.mime_type.is_null() {
1265
+ set_last_error(format!("MIME type at index {} is NULL", i));
1266
+ return ptr::null_mut();
1267
+ }
1268
+
1269
+ let bytes = unsafe { std::slice::from_raw_parts(item.data, item.data_len) };
1270
+
1271
+ let mime_str = match unsafe { CStr::from_ptr(item.mime_type) }.to_str() {
1272
+ Ok(s) => s,
1273
+ Err(e) => {
1274
+ set_last_error(format!("Invalid UTF-8 in MIME type at index {}: {}", i, e));
1275
+ return ptr::null_mut();
1276
+ }
1277
+ };
1278
+
1279
+ contents.push((bytes, mime_str));
1280
+ }
1281
+
1282
+ match kreuzberg::batch_extract_bytes_sync(contents, &config) {
1283
+ Ok(results) => {
1284
+ let mut c_results = Vec::with_capacity(results.len());
1285
+ for result in results {
1286
+ match to_c_extraction_result(result) {
1287
+ Ok(ptr) => c_results.push(ptr),
1288
+ Err(e) => {
1289
+ for c_res in c_results {
1290
+ unsafe { kreuzberg_free_result(c_res) };
1291
+ }
1292
+ set_last_error(e);
1293
+ return ptr::null_mut();
1294
+ }
1295
+ }
1296
+ }
1297
+
1298
+ let results_array = c_results.into_boxed_slice();
1299
+ let results_ptr = Box::into_raw(results_array) as *mut *mut CExtractionResult;
1300
+
1301
+ Box::into_raw(Box::new(CBatchResult {
1302
+ results: results_ptr,
1303
+ count,
1304
+ success: true,
1305
+ _padding2: [0u8; 7],
1306
+ }))
1307
+ }
1308
+ Err(e) => {
1309
+ set_last_error(e.to_string());
1310
+ ptr::null_mut()
1311
+ }
1312
+ }
1313
+ })
1314
+ }
1315
+
1316
+ /// Load an extraction configuration from a TOML/YAML/JSON file.
1317
+ ///
1318
+ /// # Safety
1319
+ ///
1320
+ /// - `file_path` must be a valid null-terminated C string
1321
+ /// - The returned string must be freed with `kreuzberg_free_string`
1322
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
1323
+ #[unsafe(no_mangle)]
1324
+ pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
1325
+ ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
1326
+ clear_last_error();
1327
+
1328
+ if file_path.is_null() {
1329
+ set_last_error("file_path cannot be NULL".to_string());
1330
+ return ptr::null_mut();
1331
+ }
1332
+
1333
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
1334
+ Ok(s) => s,
1335
+ Err(e) => {
1336
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
1337
+ return ptr::null_mut();
1338
+ }
1339
+ };
1340
+
1341
+ match ExtractionConfig::from_file(path_str) {
1342
+ Ok(config) => match serde_json::to_string(&config) {
1343
+ Ok(json) => match CString::new(json) {
1344
+ Ok(cstr) => cstr.into_raw(),
1345
+ Err(e) => {
1346
+ set_last_error(format!("Failed to create C string: {}", e));
1347
+ ptr::null_mut()
1348
+ }
1349
+ },
1350
+ Err(e) => {
1351
+ set_last_error(format!("Failed to serialize config to JSON: {}", e));
1352
+ ptr::null_mut()
1353
+ }
1354
+ },
1355
+ Err(e) => {
1356
+ set_last_error(e.to_string());
1357
+ ptr::null_mut()
1358
+ }
1359
+ }
1360
+ })
1361
+ }
1362
+
1363
+ /// Free a batch result returned by batch extraction functions.
1364
+ ///
1365
+ /// # Safety
1366
+ ///
1367
+ /// - `batch_result` must be a pointer previously returned by a batch extraction function
1368
+ /// - `batch_result` can be NULL (no-op)
1369
+ /// - `batch_result` must not be used after this call
1370
+ /// - All results and strings within the batch result will be freed automatically
1371
+ #[unsafe(no_mangle)]
1372
+ pub unsafe extern "C" fn kreuzberg_free_batch_result(batch_result: *mut CBatchResult) {
1373
+ if !batch_result.is_null() {
1374
+ let batch = unsafe { Box::from_raw(batch_result) };
1375
+
1376
+ // NOTE: Do not free individual results here - calling code is responsible for that.
1377
+
1378
+ if !batch.results.is_null() {
1379
+ unsafe {
1380
+ let _results_array = Box::from_raw(std::ptr::slice_from_raw_parts_mut(batch.results, batch.count));
1381
+ };
1382
+ }
1383
+ }
1384
+ }
1385
+
1386
/// Release a string that was handed out by a Kreuzberg function.
///
/// # Safety
///
/// - `s` must be a string previously returned by a Kreuzberg function
/// - `s` can be NULL (no-op)
/// - `s` must not be used after this call
///
/// # Example (C)
///
/// ```c
/// char* copy = kreuzberg_clone_string("hello");
/// kreuzberg_free_string(copy);
/// // copy is now invalid
/// ```
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_free_string(s: *mut c_char) {
    if s.is_null() {
        return;
    }

    // SAFETY: per the contract above, `s` originated from `CString::into_raw`;
    // retaking ownership and dropping the value frees the allocation.
    let reclaimed = unsafe { CString::from_raw(s) };
    drop(reclaimed);
}
1407
+
1408
+ /// Clone a null-terminated string using Rust's allocator.
1409
+ ///
1410
+ /// # Safety
1411
+ ///
1412
+ /// - `s` must be a valid null-terminated UTF-8 string
1413
+ /// - Returned pointer must be freed with `kreuzberg_free_string`
1414
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
1415
+ #[unsafe(no_mangle)]
1416
+ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char {
1417
+ ffi_panic_guard!("kreuzberg_clone_string", {
1418
+ clear_last_error();
1419
+
1420
+ if s.is_null() {
1421
+ set_last_error("Input string cannot be NULL".to_string());
1422
+ return ptr::null_mut();
1423
+ }
1424
+
1425
+ let raw = match unsafe { CStr::from_ptr(s) }.to_str() {
1426
+ Ok(val) => val,
1427
+ Err(e) => {
1428
+ set_last_error(format!("Invalid UTF-8 in string: {}", e));
1429
+ return ptr::null_mut();
1430
+ }
1431
+ };
1432
+
1433
+ match CString::new(raw) {
1434
+ Ok(cstr) => cstr.into_raw(),
1435
+ Err(e) => {
1436
+ set_last_error(format!("Failed to clone string: {}", e));
1437
+ ptr::null_mut()
1438
+ }
1439
+ }
1440
+ })
1441
+ }
1442
+
1443
+ /// Free an extraction result returned by `kreuzberg_extract_file_sync`.
1444
+ ///
1445
+ /// # Safety
1446
+ ///
1447
+ /// - `result` must be a pointer previously returned by `kreuzberg_extract_file_sync`
1448
+ /// - `result` can be NULL (no-op)
1449
+ /// - `result` must not be used after this call
1450
+ /// - All string fields within the result will be freed automatically
1451
+ ///
1452
+ /// # Example (C)
1453
+ ///
1454
+ /// ```c
1455
+ /// CExtractionResult* result = kreuzberg_extract_file_sync(path);
1456
+ /// // Use result...
1457
+ /// kreuzberg_free_result(result);
1458
+ /// // result is now invalid
1459
+ /// ```
1460
+ #[unsafe(no_mangle)]
1461
+ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
1462
+ if !result.is_null() {
1463
+ let result_box = unsafe { Box::from_raw(result) };
1464
+
1465
+ if !result_box.content.is_null() {
1466
+ unsafe { drop(CString::from_raw(result_box.content)) };
1467
+ }
1468
+ if !result_box.mime_type.is_null() {
1469
+ unsafe { drop(CString::from_raw(result_box.mime_type)) };
1470
+ }
1471
+ if !result_box.language.is_null() {
1472
+ unsafe { drop(CString::from_raw(result_box.language)) };
1473
+ }
1474
+ if !result_box.date.is_null() {
1475
+ unsafe { drop(CString::from_raw(result_box.date)) };
1476
+ }
1477
+ if !result_box.subject.is_null() {
1478
+ unsafe { drop(CString::from_raw(result_box.subject)) };
1479
+ }
1480
+ if !result_box.tables_json.is_null() {
1481
+ unsafe { drop(CString::from_raw(result_box.tables_json)) };
1482
+ }
1483
+ if !result_box.detected_languages_json.is_null() {
1484
+ unsafe { drop(CString::from_raw(result_box.detected_languages_json)) };
1485
+ }
1486
+ if !result_box.metadata_json.is_null() {
1487
+ unsafe { drop(CString::from_raw(result_box.metadata_json)) };
1488
+ }
1489
+ if !result_box.chunks_json.is_null() {
1490
+ unsafe { drop(CString::from_raw(result_box.chunks_json)) };
1491
+ }
1492
+ if !result_box.images_json.is_null() {
1493
+ unsafe { drop(CString::from_raw(result_box.images_json)) };
1494
+ }
1495
+ }
1496
+ }
1497
+
1498
+ /// Get the last error message from a failed operation.
1499
+ ///
1500
+ /// # Safety
1501
+ ///
1502
+ /// - Returns a static string that does not need to be freed
1503
+ /// - Returns NULL if no error has occurred
1504
+ /// - The returned string is valid until the next Kreuzberg function call on the same thread
1505
+ ///
1506
+ /// # Example (C)
1507
+ ///
1508
+ /// ```c
1509
+ /// CExtractionResult* result = kreuzberg_extract_file_sync(path);
1510
+ /// if (result == NULL) {
1511
+ /// const char* error = kreuzberg_last_error();
1512
+ /// if (error != NULL) {
1513
+ /// printf("Error: %s\n", error);
1514
+ /// }
1515
+ /// }
1516
+ /// ```
1517
+ #[unsafe(no_mangle)]
1518
+ pub unsafe extern "C" fn kreuzberg_last_error() -> *const c_char {
1519
+ LAST_ERROR_C_STRING.with(|last| match &*last.borrow() {
1520
+ Some(c_str) => c_str.as_ptr(),
1521
+ None => ptr::null(),
1522
+ })
1523
+ }
1524
+
1525
+ /// Get the error code for the last error.
1526
+ ///
1527
+ /// Returns the error code as an i32. Error codes are defined in ErrorCode enum:
1528
+ /// - 0: Success (no error)
1529
+ /// - 1: GenericError
1530
+ /// - 2: Panic
1531
+ /// - 3: InvalidArgument
1532
+ /// - 4: IoError
1533
+ /// - 5: ParsingError
1534
+ /// - 6: OcrError
1535
+ /// - 7: MissingDependency
1536
+ ///
1537
+ /// # Safety
1538
+ ///
1539
+ /// This function is thread-safe and always safe to call.
1540
+ ///
1541
+ /// # Example (C)
1542
+ ///
1543
+ /// ```c
1544
+ /// CExtractionResult* result = kreuzberg_extract_file_sync(path);
1545
+ /// if (result == NULL) {
1546
+ /// int32_t code = kreuzberg_last_error_code();
1547
+ /// if (code == 2) {
1548
+ /// // A panic occurred
1549
+ /// }
1550
+ /// }
1551
+ /// ```
1552
+ #[unsafe(no_mangle)]
1553
+ pub unsafe extern "C" fn kreuzberg_last_error_code() -> i32 {
1554
+ get_last_error_code() as i32
1555
+ }
1556
+
1557
+ /// Get the panic context for the last error (if it was a panic).
1558
+ ///
1559
+ /// Returns a JSON string containing panic context information, or NULL if
1560
+ /// the last error was not a panic.
1561
+ ///
1562
+ /// The JSON structure contains:
1563
+ /// - file: Source file where panic occurred
1564
+ /// - line: Line number
1565
+ /// - function: Function name
1566
+ /// - message: Panic message
1567
+ /// - timestamp_secs: Unix timestamp (seconds since epoch)
1568
+ ///
1569
+ /// # Safety
1570
+ ///
1571
+ /// The returned string must be freed with kreuzberg_free_string().
1572
+ ///
1573
+ /// # Example (C)
1574
+ ///
1575
+ /// ```c
1576
+ /// CExtractionResult* result = kreuzberg_extract_file_sync(path);
1577
+ /// if (result == NULL && kreuzberg_last_error_code() == 2) {
1578
+ /// const char* context = kreuzberg_last_panic_context();
1579
+ /// if (context != NULL) {
1580
+ /// printf("Panic context: %s\n", context);
1581
+ /// kreuzberg_free_string((char*)context);
1582
+ /// }
1583
+ /// }
1584
+ /// ```
1585
+ #[unsafe(no_mangle)]
1586
+ pub unsafe extern "C" fn kreuzberg_last_panic_context() -> *mut c_char {
1587
+ ffi_panic_guard!("kreuzberg_last_panic_context", {
1588
+ match get_last_panic_context() {
1589
+ Some(ctx) => {
1590
+ use std::time::UNIX_EPOCH;
1591
+
1592
+ let timestamp_secs = ctx
1593
+ .timestamp
1594
+ .duration_since(UNIX_EPOCH)
1595
+ .map(|d| d.as_secs())
1596
+ .unwrap_or(0);
1597
+
1598
+ let json_value = serde_json::json!({
1599
+ "file": ctx.file,
1600
+ "line": ctx.line,
1601
+ "function": ctx.function,
1602
+ "message": ctx.message,
1603
+ "timestamp_secs": timestamp_secs
1604
+ });
1605
+
1606
+ match serde_json::to_string(&json_value) {
1607
+ Ok(json) => match CString::new(json) {
1608
+ Ok(c_str) => c_str.into_raw(),
1609
+ Err(_) => ptr::null_mut(),
1610
+ },
1611
+ Err(_) => ptr::null_mut(),
1612
+ }
1613
+ }
1614
+ None => ptr::null_mut(),
1615
+ }
1616
+ })
1617
+ }
1618
+
1619
+ /// Get the library version string.
1620
+ ///
1621
+ /// # Safety
1622
+ ///
1623
+ /// - Returns a static string that does not need to be freed
1624
+ /// - The returned string is always valid
1625
+ ///
1626
+ /// # Example (C)
1627
+ ///
1628
+ /// ```c
1629
+ /// const char* version = kreuzberg_version();
1630
+ /// printf("Kreuzberg version: %s\n", version);
1631
+ /// ```
1632
+ #[unsafe(no_mangle)]
1633
+ pub unsafe extern "C" fn kreuzberg_version() -> *const c_char {
1634
+ concat!(env!("CARGO_PKG_VERSION"), "\0").as_ptr() as *const c_char
1635
+ }
1636
+
1637
/// Type alias for the OCR backend callback function.
///
/// # Parameters
///
/// - `image_bytes`: Pointer to image data
/// - `image_length`: Length of image data in bytes
/// - `config_json`: JSON-encoded OcrConfig (null-terminated string)
///
/// # Returns
///
/// Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
/// or NULL on error.
///
/// # Safety
///
/// The callback must:
/// - Not store the image_bytes pointer (it's only valid for the duration of the call)
/// - Return a valid null-terminated UTF-8 string that `kreuzberg_free_string` can release.
///   Rust frees the returned pointer with `kreuzberg_free_string`, so it must come from
///   Rust's allocator (for example via `kreuzberg_clone_string`), not from `malloc`/`strdup`
/// - Return NULL on error (error message should be retrievable separately)
type OcrBackendCallback =
    unsafe extern "C" fn(image_bytes: *const u8, image_length: usize, config_json: *const c_char) -> *mut c_char;
1658
+
1659
+ fn parse_languages_from_json(languages_json: *const c_char) -> FfiResult<Option<Vec<String>>> {
1660
+ if languages_json.is_null() {
1661
+ return Ok(None);
1662
+ }
1663
+
1664
+ let raw = unsafe { CStr::from_ptr(languages_json) }
1665
+ .to_str()
1666
+ .map_err(|e| format!("Invalid UTF-8 in languages JSON: {}", e))?;
1667
+
1668
+ if raw.trim().is_empty() {
1669
+ return Ok(None);
1670
+ }
1671
+
1672
+ let langs: Vec<String> = serde_json::from_str(raw).map_err(|e| format!("Failed to parse languages JSON: {}", e))?;
1673
+
1674
+ if langs.is_empty() {
1675
+ return Ok(None);
1676
+ }
1677
+
1678
+ let normalized = langs
1679
+ .into_iter()
1680
+ .map(|l| l.trim().to_string())
1681
+ .filter(|l| !l.is_empty())
1682
+ .collect::<Vec<_>>();
1683
+
1684
+ if normalized.is_empty() {
1685
+ return Ok(None);
1686
+ }
1687
+
1688
+ Ok(Some(normalized))
1689
+ }
1690
+
1691
/// FFI wrapper for custom OCR backends registered from Java/C.
///
/// This struct wraps a C function pointer and implements the OcrBackend trait,
/// allowing custom OCR implementations from FFI languages to be registered
/// and used within the Rust extraction pipeline.
struct FfiOcrBackend {
    // Plugin name reported through `Plugin::name`.
    name: String,
    // C callback invoked for every image to be processed.
    callback: OcrBackendCallback,
    // Languages the backend accepts; `None` means every language is reported as supported.
    supported_languages: Option<Vec<String>>,
}
1701
+
1702
+ impl FfiOcrBackend {
1703
+ fn new(name: String, callback: OcrBackendCallback, supported_languages: Option<Vec<String>>) -> Self {
1704
+ Self {
1705
+ name,
1706
+ callback,
1707
+ supported_languages,
1708
+ }
1709
+ }
1710
+ }
1711
+
1712
+ impl Plugin for FfiOcrBackend {
1713
+ fn name(&self) -> &str {
1714
+ &self.name
1715
+ }
1716
+
1717
+ fn version(&self) -> String {
1718
+ "ffi-1.0.0".to_string()
1719
+ }
1720
+
1721
+ fn initialize(&self) -> Result<()> {
1722
+ Ok(())
1723
+ }
1724
+
1725
+ fn shutdown(&self) -> Result<()> {
1726
+ Ok(())
1727
+ }
1728
+ }
1729
+
1730
+ #[async_trait]
1731
+ impl OcrBackend for FfiOcrBackend {
1732
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
1733
+ let config_json = serde_json::to_string(config).map_err(|e| KreuzbergError::Validation {
1734
+ message: format!("Failed to serialize OCR config: {}", e),
1735
+ source: Some(Box::new(e)),
1736
+ })?;
1737
+
1738
+ let callback = self.callback;
1739
+ let image_data = image_bytes.to_vec();
1740
+ let config_json_owned = config_json.clone();
1741
+
1742
+ let result_text = tokio::task::spawn_blocking(move || {
1743
+ let config_cstring = CString::new(config_json_owned).map_err(|e| KreuzbergError::Validation {
1744
+ message: format!("Failed to create C string from config JSON: {}", e),
1745
+ source: Some(Box::new(e)),
1746
+ })?;
1747
+
1748
+ let result_ptr = unsafe { callback(image_data.as_ptr(), image_data.len(), config_cstring.as_ptr()) };
1749
+
1750
+ if result_ptr.is_null() {
1751
+ return Err(KreuzbergError::Ocr {
1752
+ message: "OCR backend returned NULL (operation failed)".to_string(),
1753
+ source: None,
1754
+ });
1755
+ }
1756
+
1757
+ let result_cstr = unsafe { CStr::from_ptr(result_ptr) };
1758
+ let text = result_cstr
1759
+ .to_str()
1760
+ .map_err(|e| KreuzbergError::Ocr {
1761
+ message: format!("OCR backend returned invalid UTF-8: {}", e),
1762
+ source: Some(Box::new(e)),
1763
+ })?
1764
+ .to_string();
1765
+
1766
+ unsafe { kreuzberg_free_string(result_ptr) };
1767
+
1768
+ Ok(text)
1769
+ })
1770
+ .await
1771
+ .map_err(|e| KreuzbergError::Ocr {
1772
+ message: format!("OCR backend task panicked: {}", e),
1773
+ source: Some(Box::new(e)),
1774
+ })??;
1775
+
1776
+ Ok(ExtractionResult {
1777
+ content: result_text,
1778
+ mime_type: "text/plain".to_string(),
1779
+ metadata: kreuzberg::types::Metadata::default(),
1780
+ tables: vec![],
1781
+ detected_languages: None,
1782
+ chunks: None,
1783
+ images: None,
1784
+ pages: None,
1785
+ })
1786
+ }
1787
+
1788
+ fn supports_language(&self, _lang: &str) -> bool {
1789
+ match &self.supported_languages {
1790
+ Some(langs) => langs.iter().any(|candidate| candidate.eq_ignore_ascii_case(_lang)),
1791
+ None => true,
1792
+ }
1793
+ }
1794
+
1795
+ fn backend_type(&self) -> kreuzberg::plugins::OcrBackendType {
1796
+ kreuzberg::plugins::OcrBackendType::Custom
1797
+ }
1798
+ }
1799
+
1800
+ /// Register a custom OCR backend via FFI callback.
1801
+ ///
1802
+ /// # Safety
1803
+ ///
1804
+ /// - `name` must be a valid null-terminated C string
1805
+ /// - `callback` must be a valid function pointer that:
1806
+ /// - Does not store the image_bytes pointer
1807
+ /// - Returns a null-terminated UTF-8 string or NULL on error
1808
+ /// - The returned string must be freeable by kreuzberg_free_string
1809
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
1810
+ ///
1811
+ /// # Example (C)
1812
+ ///
1813
+ /// ```c
1814
+ /// char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
1815
+ /// // Implement OCR logic here
1816
+ /// // Return allocated string with result, or NULL on error
1817
+ /// return strdup("Extracted text");
1818
+ /// }
1819
+ ///
1820
+ /// bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
1821
+ /// if (!success) {
1822
+ /// const char* error = kreuzberg_last_error();
1823
+ /// printf("Failed to register: %s\n", error);
1824
+ /// }
1825
+ /// ```
1826
+ #[unsafe(no_mangle)]
1827
+ pub unsafe extern "C" fn kreuzberg_register_ocr_backend(name: *const c_char, callback: OcrBackendCallback) -> bool {
1828
+ ffi_panic_guard_bool!("kreuzberg_register_ocr_backend", {
1829
+ clear_last_error();
1830
+
1831
+ if name.is_null() {
1832
+ set_last_error("Backend name cannot be NULL".to_string());
1833
+ return false;
1834
+ }
1835
+
1836
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
1837
+ Ok(s) => s,
1838
+ Err(e) => {
1839
+ set_last_error(format!("Invalid UTF-8 in backend name: {}", e));
1840
+ return false;
1841
+ }
1842
+ };
1843
+
1844
+ if name_str.is_empty() {
1845
+ set_last_error("Plugin name cannot be empty".to_string());
1846
+ return false;
1847
+ }
1848
+
1849
+ if name_str.chars().any(|c| c.is_whitespace()) {
1850
+ set_last_error("Plugin name cannot contain whitespace".to_string());
1851
+ return false;
1852
+ }
1853
+
1854
+ let backend = Arc::new(FfiOcrBackend::new(name_str.to_string(), callback, None));
1855
+
1856
+ let registry = get_ocr_backend_registry();
1857
+ let mut registry_guard = match registry.write() {
1858
+ Ok(guard) => guard,
1859
+ Err(e) => {
1860
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
1861
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
1862
+ return false;
1863
+ }
1864
+ };
1865
+
1866
+ match registry_guard.register(backend) {
1867
+ Ok(()) => true,
1868
+ Err(e) => {
1869
+ set_last_error(format!("Failed to register OCR backend: {}", e));
1870
+ false
1871
+ }
1872
+ }
1873
+ })
1874
+ }
1875
+
1876
+ /// Register a custom OCR backend with explicit language support via FFI callback.
1877
+ ///
1878
+ /// # Safety
1879
+ ///
1880
+ /// - `languages_json` must be a null-terminated JSON array of language codes or NULL
1881
+ /// - See `kreuzberg_register_ocr_backend` for additional safety notes.
1882
+ #[unsafe(no_mangle)]
1883
+ pub unsafe extern "C" fn kreuzberg_register_ocr_backend_with_languages(
1884
+ name: *const c_char,
1885
+ callback: OcrBackendCallback,
1886
+ languages_json: *const c_char,
1887
+ ) -> bool {
1888
+ ffi_panic_guard_bool!("kreuzberg_register_ocr_backend_with_languages", {
1889
+ clear_last_error();
1890
+
1891
+ if name.is_null() {
1892
+ set_last_error("Backend name cannot be NULL".to_string());
1893
+ return false;
1894
+ }
1895
+
1896
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
1897
+ Ok(s) => s,
1898
+ Err(e) => {
1899
+ set_last_error(format!("Invalid UTF-8 in backend name: {}", e));
1900
+ return false;
1901
+ }
1902
+ };
1903
+
1904
+ if name_str.is_empty() {
1905
+ set_last_error("Plugin name cannot be empty".to_string());
1906
+ return false;
1907
+ }
1908
+
1909
+ if name_str.chars().any(|c| c.is_whitespace()) {
1910
+ set_last_error("Plugin name cannot contain whitespace".to_string());
1911
+ return false;
1912
+ }
1913
+
1914
+ let supported_languages = match parse_languages_from_json(languages_json) {
1915
+ Ok(langs) => langs,
1916
+ Err(e) => {
1917
+ set_last_error(e);
1918
+ return false;
1919
+ }
1920
+ };
1921
+
1922
+ let backend = Arc::new(FfiOcrBackend::new(name_str.to_string(), callback, supported_languages));
1923
+
1924
+ let registry = get_ocr_backend_registry();
1925
+ let mut registry_guard = match registry.write() {
1926
+ Ok(guard) => guard,
1927
+ Err(e) => {
1928
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
1929
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
1930
+ return false;
1931
+ }
1932
+ };
1933
+
1934
+ match registry_guard.register(backend) {
1935
+ Ok(()) => true,
1936
+ Err(e) => {
1937
+ set_last_error(format!("Failed to register OCR backend: {}", e));
1938
+ false
1939
+ }
1940
+ }
1941
+ })
1942
+ }
1943
+
1944
/// Type alias for the PostProcessor callback function.
///
/// # Parameters
///
/// - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
///
/// # Returns
///
/// Null-terminated JSON string containing the processed ExtractionResult
/// (must be freed by Rust via kreuzberg_free_string), or NULL on error.
///
/// # Safety
///
/// The callback must:
/// - Not store the result_json pointer (it's only valid for the duration of the call)
/// - Return a valid null-terminated UTF-8 JSON string that `kreuzberg_free_string` can
///   release. Rust frees the returned pointer with `kreuzberg_free_string`, so it must come
///   from Rust's allocator (for example via `kreuzberg_clone_string`), not from `malloc`/`strdup`
/// - Return NULL on error (error message should be retrievable separately)
type PostProcessorCallback = unsafe extern "C" fn(result_json: *const c_char) -> *mut c_char;
1962
+
1963
/// FFI wrapper for custom PostProcessors registered from Java/C.
///
/// This struct wraps a C function pointer and implements the PostProcessor trait,
/// allowing custom post-processing implementations from FFI languages to be registered
/// and used within the Rust extraction pipeline.
struct FfiPostProcessor {
    // Plugin name reported through `Plugin::name` and attached to plugin errors.
    name: String,
    // C callback that receives the result as JSON and returns the processed JSON.
    callback: PostProcessorCallback,
    // Stage reported by `processing_stage`.
    stage: ProcessingStage,
}
1973
+
1974
+ impl FfiPostProcessor {
1975
+ fn new(name: String, callback: PostProcessorCallback, stage: ProcessingStage) -> Self {
1976
+ Self { name, callback, stage }
1977
+ }
1978
+ }
1979
+
1980
+ impl Plugin for FfiPostProcessor {
1981
+ fn name(&self) -> &str {
1982
+ &self.name
1983
+ }
1984
+
1985
+ fn version(&self) -> String {
1986
+ "ffi-1.0.0".to_string()
1987
+ }
1988
+
1989
+ fn initialize(&self) -> Result<()> {
1990
+ Ok(())
1991
+ }
1992
+
1993
+ fn shutdown(&self) -> Result<()> {
1994
+ Ok(())
1995
+ }
1996
+ }
1997
+
1998
+ #[async_trait]
1999
+ impl kreuzberg::plugins::PostProcessor for FfiPostProcessor {
2000
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
2001
+ let result_json = serde_json::to_string(&*result).map_err(|e| KreuzbergError::Validation {
2002
+ message: format!("Failed to serialize ExtractionResult: {}", e),
2003
+ source: Some(Box::new(e)),
2004
+ })?;
2005
+
2006
+ let callback = self.callback;
2007
+ let processor_name = self.name.clone();
2008
+ let result_json_owned = result_json.clone();
2009
+
2010
+ let processed_json = tokio::task::spawn_blocking(move || {
2011
+ let result_cstring = CString::new(result_json_owned).map_err(|e| KreuzbergError::Validation {
2012
+ message: format!("Failed to create C string from result JSON: {}", e),
2013
+ source: Some(Box::new(e)),
2014
+ })?;
2015
+
2016
+ let processed_ptr = unsafe { callback(result_cstring.as_ptr()) };
2017
+
2018
+ if processed_ptr.is_null() {
2019
+ return Err(KreuzbergError::Plugin {
2020
+ message: "PostProcessor returned NULL (operation failed)".to_string(),
2021
+ plugin_name: processor_name.clone(),
2022
+ });
2023
+ }
2024
+
2025
+ let processed_cstr = unsafe { CStr::from_ptr(processed_ptr) };
2026
+ let json = processed_cstr
2027
+ .to_str()
2028
+ .map_err(|e| KreuzbergError::Plugin {
2029
+ message: format!("PostProcessor returned invalid UTF-8: {}", e),
2030
+ plugin_name: processor_name.clone(),
2031
+ })?
2032
+ .to_string();
2033
+
2034
+ unsafe { kreuzberg_free_string(processed_ptr) };
2035
+
2036
+ Ok(json)
2037
+ })
2038
+ .await
2039
+ .map_err(|e| KreuzbergError::Plugin {
2040
+ message: format!("PostProcessor task panicked: {}", e),
2041
+ plugin_name: self.name.clone(),
2042
+ })??;
2043
+
2044
+ let processed_result: ExtractionResult =
2045
+ serde_json::from_str(&processed_json).map_err(|e| KreuzbergError::Plugin {
2046
+ message: format!("Failed to deserialize processed result: {}", e),
2047
+ plugin_name: self.name.clone(),
2048
+ })?;
2049
+
2050
+ *result = processed_result;
2051
+
2052
+ Ok(())
2053
+ }
2054
+
2055
+ fn processing_stage(&self) -> kreuzberg::plugins::ProcessingStage {
2056
+ self.stage
2057
+ }
2058
+ }
2059
+
2060
+ fn parse_processing_stage(stage: Option<&str>) -> FfiResult<ProcessingStage> {
2061
+ match stage {
2062
+ Some(value) => match value.to_lowercase().as_str() {
2063
+ "early" => Ok(ProcessingStage::Early),
2064
+ "middle" => Ok(ProcessingStage::Middle),
2065
+ "late" => Ok(ProcessingStage::Late),
2066
+ other => Err(format!(
2067
+ "Invalid processing stage '{}'. Expected one of: early, middle, late",
2068
+ other
2069
+ )),
2070
+ },
2071
+ None => Ok(ProcessingStage::Middle),
2072
+ }
2073
+ }
2074
+
2075
+ /// Register a custom PostProcessor via FFI callback.
2076
+ ///
2077
+ /// # Safety
2078
+ ///
2079
+ /// - `name` must be a valid null-terminated C string
2080
+ /// - `callback` must be a valid function pointer that:
2081
+ /// - Does not store the result_json pointer
2082
+ /// - Returns a null-terminated UTF-8 JSON string or NULL on error
2083
+ /// - The returned string must be freeable by kreuzberg_free_string
2084
+ /// - `priority` determines the order of execution (higher priority runs first)
2085
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2086
+ ///
2087
+ /// # Example (C)
2088
+ ///
2089
+ /// ```c
2090
+ /// char* my_post_processor(const char* result_json) {
2091
+ /// // Parse result_json, modify it, return JSON string
2092
+ /// return strdup("{\"content\":\"PROCESSED\"}");
2093
+ /// }
2094
+ ///
2095
+ /// bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
2096
+ /// if (!success) {
2097
+ /// const char* error = kreuzberg_last_error();
2098
+ /// printf("Failed to register: %s\n", error);
2099
+ /// }
2100
+ /// ```
2101
+ #[unsafe(no_mangle)]
2102
+ pub unsafe extern "C" fn kreuzberg_register_post_processor(
2103
+ name: *const c_char,
2104
+ callback: PostProcessorCallback,
2105
+ priority: i32,
2106
+ ) -> bool {
2107
+ ffi_panic_guard_bool!("kreuzberg_register_post_processor", {
2108
+ clear_last_error();
2109
+
2110
+ if name.is_null() {
2111
+ set_last_error("PostProcessor name cannot be NULL".to_string());
2112
+ return false;
2113
+ }
2114
+
2115
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2116
+ Ok(s) => s,
2117
+ Err(e) => {
2118
+ set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
2119
+ return false;
2120
+ }
2121
+ };
2122
+
2123
+ if name_str.is_empty() {
2124
+ set_last_error("Plugin name cannot be empty".to_string());
2125
+ return false;
2126
+ }
2127
+
2128
+ if name_str.chars().any(|c| c.is_whitespace()) {
2129
+ set_last_error("Plugin name cannot contain whitespace".to_string());
2130
+ return false;
2131
+ }
2132
+
2133
+ let processor = Arc::new(FfiPostProcessor::new(
2134
+ name_str.to_string(),
2135
+ callback,
2136
+ ProcessingStage::Middle,
2137
+ ));
2138
+
2139
+ let registry = kreuzberg::plugins::registry::get_post_processor_registry();
2140
+ let mut registry_guard = match registry.write() {
2141
+ Ok(guard) => guard,
2142
+ Err(e) => {
2143
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2144
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2145
+ return false;
2146
+ }
2147
+ };
2148
+
2149
+ match registry_guard.register(processor, priority) {
2150
+ Ok(()) => true,
2151
+ Err(e) => {
2152
+ set_last_error(format!("Failed to register PostProcessor: {}", e));
2153
+ false
2154
+ }
2155
+ }
2156
+ })
2157
+ }
2158
+
2159
+ /// Register a custom PostProcessor with an explicit processing stage.
2160
+ ///
2161
+ /// # Safety
2162
+ ///
2163
+ /// - `name` must be a valid null-terminated C string
2164
+ /// - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
2165
+ /// - `callback` must be a valid function pointer that:
2166
+ /// - Does not store the result_json pointer
2167
+ /// - Returns a null-terminated UTF-8 JSON string or NULL on error
2168
+ /// - The returned string must be freeable by kreuzberg_free_string
2169
+ /// - `priority` determines the order of execution within the stage (higher priority runs first)
2170
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2171
+ #[unsafe(no_mangle)]
2172
+ pub unsafe extern "C" fn kreuzberg_register_post_processor_with_stage(
2173
+ name: *const c_char,
2174
+ callback: PostProcessorCallback,
2175
+ priority: i32,
2176
+ stage: *const c_char,
2177
+ ) -> bool {
2178
+ ffi_panic_guard_bool!("kreuzberg_register_post_processor_with_stage", {
2179
+ clear_last_error();
2180
+
2181
+ if name.is_null() {
2182
+ set_last_error("PostProcessor name cannot be NULL".to_string());
2183
+ return false;
2184
+ }
2185
+
2186
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2187
+ Ok(s) => s,
2188
+ Err(e) => {
2189
+ set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
2190
+ return false;
2191
+ }
2192
+ };
2193
+
2194
+ if name_str.is_empty() {
2195
+ set_last_error("Plugin name cannot be empty".to_string());
2196
+ return false;
2197
+ }
2198
+
2199
+ if name_str.chars().any(|c| c.is_whitespace()) {
2200
+ set_last_error("Plugin name cannot contain whitespace".to_string());
2201
+ return false;
2202
+ }
2203
+
2204
+ let stage_str = if stage.is_null() {
2205
+ None
2206
+ } else {
2207
+ match unsafe { CStr::from_ptr(stage) }.to_str() {
2208
+ Ok(s) => Some(s),
2209
+ Err(e) => {
2210
+ set_last_error(format!("Invalid UTF-8 in processing stage: {}", e));
2211
+ return false;
2212
+ }
2213
+ }
2214
+ };
2215
+
2216
+ let stage = match parse_processing_stage(stage_str) {
2217
+ Ok(stage) => stage,
2218
+ Err(e) => {
2219
+ set_last_error(e);
2220
+ return false;
2221
+ }
2222
+ };
2223
+
2224
+ let processor = Arc::new(FfiPostProcessor::new(name_str.to_string(), callback, stage));
2225
+
2226
+ let registry = kreuzberg::plugins::registry::get_post_processor_registry();
2227
+ let mut registry_guard = match registry.write() {
2228
+ Ok(guard) => guard,
2229
+ Err(e) => {
2230
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2231
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2232
+ return false;
2233
+ }
2234
+ };
2235
+
2236
+ match registry_guard.register(processor, priority) {
2237
+ Ok(()) => true,
2238
+ Err(e) => {
2239
+ set_last_error(format!("Failed to register PostProcessor: {}", e));
2240
+ false
2241
+ }
2242
+ }
2243
+ })
2244
+ }
2245
+
2246
+ /// Unregister a PostProcessor by name.
2247
+ ///
2248
+ /// # Safety
2249
+ ///
2250
+ /// - `name` must be a valid null-terminated C string
2251
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2252
+ ///
2253
+ /// # Example (C)
2254
+ ///
2255
+ /// ```c
2256
+ /// bool success = kreuzberg_unregister_post_processor("my-processor");
2257
+ /// if (!success) {
2258
+ /// const char* error = kreuzberg_last_error();
2259
+ /// printf("Failed to unregister: %s\n", error);
2260
+ /// }
2261
+ /// ```
2262
+ #[unsafe(no_mangle)]
2263
+ pub unsafe extern "C" fn kreuzberg_unregister_post_processor(name: *const c_char) -> bool {
2264
+ ffi_panic_guard_bool!("kreuzberg_unregister_post_processor", {
2265
+ clear_last_error();
2266
+
2267
+ if name.is_null() {
2268
+ set_last_error("PostProcessor name cannot be NULL".to_string());
2269
+ return false;
2270
+ }
2271
+
2272
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2273
+ Ok(s) => s,
2274
+ Err(e) => {
2275
+ set_last_error(format!("Invalid UTF-8 in PostProcessor name: {}", e));
2276
+ return false;
2277
+ }
2278
+ };
2279
+
2280
+ let registry = kreuzberg::plugins::registry::get_post_processor_registry();
2281
+ let mut registry_guard = match registry.write() {
2282
+ Ok(guard) => guard,
2283
+ Err(e) => {
2284
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2285
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2286
+ return false;
2287
+ }
2288
+ };
2289
+
2290
+ match registry_guard.remove(name_str) {
2291
+ Ok(()) => true,
2292
+ Err(e) => {
2293
+ set_last_error(format!("Failed to remove PostProcessor: {}", e));
2294
+ false
2295
+ }
2296
+ }
2297
+ })
2298
+ }
2299
+
2300
+ /// Clear all registered PostProcessors.
2301
+ ///
2302
+ /// # Safety
2303
+ ///
2304
+ /// - Removes all registered processors. Subsequent extractions will run without them.
2305
+ /// - Returns true on success, false on error.
2306
+ #[unsafe(no_mangle)]
2307
+ pub unsafe extern "C" fn kreuzberg_clear_post_processors() -> bool {
2308
+ ffi_panic_guard_bool!("kreuzberg_clear_post_processors", {
2309
+ clear_last_error();
2310
+
2311
+ let registry = kreuzberg::plugins::registry::get_post_processor_registry();
2312
+ let mut registry_guard = match registry.write() {
2313
+ Ok(guard) => guard,
2314
+ Err(e) => {
2315
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2316
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2317
+ return false;
2318
+ }
2319
+ };
2320
+
2321
+ *registry_guard = Default::default();
2322
+ true
2323
+ })
2324
+ }
2325
+
2326
+ /// List all registered PostProcessors as a JSON array of names.
2327
+ ///
2328
+ /// # Safety
2329
+ ///
2330
+ /// - Returned string must be freed with `kreuzberg_free_string`.
2331
+ /// - Returns NULL on error (check `kreuzberg_last_error`).
2332
+ #[unsafe(no_mangle)]
2333
+ pub unsafe extern "C" fn kreuzberg_list_post_processors() -> *mut c_char {
2334
+ ffi_panic_guard!("kreuzberg_list_post_processors", {
2335
+ clear_last_error();
2336
+
2337
+ let registry = kreuzberg::plugins::registry::get_post_processor_registry();
2338
+ let registry_guard = match registry.read() {
2339
+ Ok(guard) => guard,
2340
+ Err(e) => {
2341
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2342
+ set_last_error(format!("Failed to acquire registry read lock: {}", e));
2343
+ return ptr::null_mut();
2344
+ }
2345
+ };
2346
+
2347
+ match serde_json::to_string(&registry_guard.list()) {
2348
+ Ok(json) => match CString::new(json) {
2349
+ Ok(cstr) => cstr.into_raw(),
2350
+ Err(e) => {
2351
+ set_last_error(format!("Failed to create C string: {}", e));
2352
+ ptr::null_mut()
2353
+ }
2354
+ },
2355
+ Err(e) => {
2356
+ set_last_error(format!("Failed to serialize PostProcessor list: {}", e));
2357
+ ptr::null_mut()
2358
+ }
2359
+ }
2360
+ })
2361
+ }
2362
+
2363
+ /// Type alias for the DocumentExtractor callback function.
2364
+ ///
2365
+ /// # Parameters
2366
+ ///
2367
+ /// - `content`: Raw document bytes
2368
+ /// - `content_len`: Length of the content array
2369
+ /// - `mime_type`: MIME type of the document (null-terminated string)
2370
+ /// - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
2371
+ ///
2372
+ /// # Returns
2373
+ ///
2374
+ /// Null-terminated JSON string containing the ExtractionResult, or NULL on error.
2375
+ /// The returned string must be freeable by kreuzberg_free_string.
2376
+ ///
2377
+ /// # Safety
2378
+ ///
2379
+ /// The callback must:
2380
+ /// - Not store the content, mime_type, or config_json pointers (only valid during the call)
2381
+ /// - Return a valid null-terminated UTF-8 JSON string or NULL on error
2382
+ /// - The returned string must be freeable by kreuzberg_free_string
2383
+ type DocumentExtractorCallback = unsafe extern "C" fn(
2384
+ content: *const u8,
2385
+ content_len: usize,
2386
+ mime_type: *const c_char,
2387
+ config_json: *const c_char,
2388
+ ) -> *mut c_char;
2389
+
2390
+ /// FFI wrapper for custom DocumentExtractors registered from Java/C.
2391
+ ///
2392
+ /// This struct wraps a C function pointer and implements the DocumentExtractor trait,
2393
+ /// allowing custom extraction implementations from FFI languages to be registered
2394
+ /// and used within the Rust extraction pipeline.
2395
+ struct FfiDocumentExtractor {
2396
+ name: String,
2397
+ callback: DocumentExtractorCallback,
2398
+ #[allow(dead_code)]
2399
+ supported_types: Vec<String>,
2400
+ supported_types_static: Vec<&'static str>,
2401
+ priority: i32,
2402
+ }
2403
+
2404
+ impl FfiDocumentExtractor {
2405
+ fn new(name: String, callback: DocumentExtractorCallback, supported_types: Vec<String>, priority: i32) -> Self {
2406
+ let supported_types_static: Vec<&'static str> = supported_types
2407
+ .iter()
2408
+ .map(|s| {
2409
+ let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
2410
+ leaked
2411
+ })
2412
+ .collect();
2413
+
2414
+ Self {
2415
+ name,
2416
+ callback,
2417
+ supported_types,
2418
+ supported_types_static,
2419
+ priority,
2420
+ }
2421
+ }
2422
+ }
2423
+
2424
+ impl Plugin for FfiDocumentExtractor {
2425
+ fn name(&self) -> &str {
2426
+ &self.name
2427
+ }
2428
+
2429
+ fn version(&self) -> String {
2430
+ "ffi-1.0.0".to_string()
2431
+ }
2432
+
2433
+ fn initialize(&self) -> Result<()> {
2434
+ Ok(())
2435
+ }
2436
+
2437
+ fn shutdown(&self) -> Result<()> {
2438
+ Ok(())
2439
+ }
2440
+ }
2441
+
2442
+ #[async_trait]
2443
+ impl kreuzberg::plugins::DocumentExtractor for FfiDocumentExtractor {
2444
+ async fn extract_bytes(
2445
+ &self,
2446
+ content: &[u8],
2447
+ mime_type: &str,
2448
+ config: &ExtractionConfig,
2449
+ ) -> Result<ExtractionResult> {
2450
+ let config_json = serde_json::to_string(config).map_err(|e| KreuzbergError::Validation {
2451
+ message: format!("Failed to serialize ExtractionConfig: {}", e),
2452
+ source: Some(Box::new(e)),
2453
+ })?;
2454
+
2455
+ let callback = self.callback;
2456
+ let extractor_name = self.name.clone();
2457
+ let extractor_name_error = self.name.clone();
2458
+ let extractor_name_parse = self.name.clone();
2459
+ let content_vec = content.to_vec();
2460
+ let mime_type_owned = mime_type.to_string();
2461
+ let config_json_owned = config_json.clone();
2462
+
2463
+ let result_json = tokio::task::spawn_blocking(move || {
2464
+ let mime_cstr = match CString::new(mime_type_owned.clone()) {
2465
+ Ok(s) => s,
2466
+ Err(e) => {
2467
+ return Err(KreuzbergError::Validation {
2468
+ message: format!("Invalid MIME type for extractor '{}': {}", extractor_name, e),
2469
+ source: Some(Box::new(e)),
2470
+ });
2471
+ }
2472
+ };
2473
+
2474
+ let config_cstr = match CString::new(config_json_owned.clone()) {
2475
+ Ok(s) => s,
2476
+ Err(e) => {
2477
+ return Err(KreuzbergError::Validation {
2478
+ message: format!("Invalid config JSON for extractor '{}': {}", extractor_name, e),
2479
+ source: Some(Box::new(e)),
2480
+ });
2481
+ }
2482
+ };
2483
+
2484
+ let result_ptr = unsafe {
2485
+ callback(
2486
+ content_vec.as_ptr(),
2487
+ content_vec.len(),
2488
+ mime_cstr.as_ptr(),
2489
+ config_cstr.as_ptr(),
2490
+ )
2491
+ };
2492
+
2493
+ if result_ptr.is_null() {
2494
+ return Err(KreuzbergError::Parsing {
2495
+ message: format!("DocumentExtractor '{}' returned NULL (callback failed)", extractor_name),
2496
+ source: None,
2497
+ });
2498
+ }
2499
+
2500
+ let result_cstr = unsafe { CString::from_raw(result_ptr) };
2501
+ let result_str = result_cstr.to_str().map_err(|e| KreuzbergError::Validation {
2502
+ message: format!("Invalid UTF-8 in result from extractor '{}': {}", extractor_name, e),
2503
+ source: Some(Box::new(e)),
2504
+ })?;
2505
+
2506
+ Ok(result_str.to_string())
2507
+ })
2508
+ .await
2509
+ .map_err(|e| {
2510
+ KreuzbergError::Other(format!(
2511
+ "Task join error in extractor '{}': {}",
2512
+ extractor_name_error, e
2513
+ ))
2514
+ })??;
2515
+
2516
+ serde_json::from_str(&result_json).map_err(|e| KreuzbergError::Parsing {
2517
+ message: format!(
2518
+ "Failed to deserialize ExtractionResult from extractor '{}': {}",
2519
+ extractor_name_parse, e
2520
+ ),
2521
+ source: Some(Box::new(e)),
2522
+ })
2523
+ }
2524
+
2525
+ async fn extract_file(
2526
+ &self,
2527
+ path: &std::path::Path,
2528
+ mime_type: &str,
2529
+ config: &ExtractionConfig,
2530
+ ) -> Result<ExtractionResult> {
2531
+ let content = tokio::fs::read(path).await.map_err(KreuzbergError::Io)?;
2532
+ self.extract_bytes(&content, mime_type, config).await
2533
+ }
2534
+
2535
+ fn supported_mime_types(&self) -> &[&str] {
2536
+ &self.supported_types_static
2537
+ }
2538
+
2539
+ fn priority(&self) -> i32 {
2540
+ self.priority
2541
+ }
2542
+ }
2543
+
2544
+ /// Register a custom DocumentExtractor via FFI callback.
2545
+ ///
2546
+ /// # Safety
2547
+ ///
2548
+ /// - `name` must be a valid null-terminated C string
2549
+ /// - `callback` must be a valid function pointer that:
2550
+ /// - Does not store the content, mime_type, or config_json pointers
2551
+ /// - Returns a null-terminated UTF-8 JSON string or NULL on error
2552
+ /// - The returned string must be freeable by kreuzberg_free_string
2553
+ /// - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
2554
+ /// - `priority` determines the order of selection (higher priority preferred)
2555
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2556
+ ///
2557
+ /// # Example (C)
2558
+ ///
2559
+ /// ```c
2560
+ /// char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
2561
+ /// // Extract content from bytes, return JSON ExtractionResult
2562
+ /// return strdup("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
2563
+ /// }
2564
+ ///
2565
+ /// bool success = kreuzberg_register_document_extractor(
2566
+ /// "my-extractor",
2567
+ /// my_extractor,
2568
+ /// "application/x-custom,text/x-custom",
2569
+ /// 100
2570
+ /// );
2571
+ /// if (!success) {
2572
+ /// const char* error = kreuzberg_last_error();
2573
+ /// printf("Failed to register: %s\n", error);
2574
+ /// }
2575
+ /// ```
2576
+ #[unsafe(no_mangle)]
2577
+ pub unsafe extern "C" fn kreuzberg_register_document_extractor(
2578
+ name: *const c_char,
2579
+ callback: DocumentExtractorCallback,
2580
+ mime_types: *const c_char,
2581
+ priority: i32,
2582
+ ) -> bool {
2583
+ ffi_panic_guard_bool!("kreuzberg_register_document_extractor", {
2584
+ clear_last_error();
2585
+
2586
+ if name.is_null() {
2587
+ set_last_error("DocumentExtractor name cannot be NULL".to_string());
2588
+ return false;
2589
+ }
2590
+
2591
+ if mime_types.is_null() {
2592
+ set_last_error("MIME types cannot be NULL".to_string());
2593
+ return false;
2594
+ }
2595
+
2596
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2597
+ Ok(s) => s,
2598
+ Err(e) => {
2599
+ set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
2600
+ return false;
2601
+ }
2602
+ };
2603
+
2604
+ if name_str.is_empty() {
2605
+ set_last_error("Plugin name cannot be empty".to_string());
2606
+ return false;
2607
+ }
2608
+
2609
+ if name_str.chars().any(|c| c.is_whitespace()) {
2610
+ set_last_error("Plugin name cannot contain whitespace".to_string());
2611
+ return false;
2612
+ }
2613
+
2614
+ let mime_types_str = match unsafe { CStr::from_ptr(mime_types) }.to_str() {
2615
+ Ok(s) => s,
2616
+ Err(e) => {
2617
+ set_last_error(format!("Invalid UTF-8 in MIME types: {}", e));
2618
+ return false;
2619
+ }
2620
+ };
2621
+
2622
+ let supported_types: Vec<String> = mime_types_str
2623
+ .split(',')
2624
+ .map(|s| s.trim().to_string())
2625
+ .filter(|s| !s.is_empty())
2626
+ .collect();
2627
+
2628
+ if supported_types.is_empty() {
2629
+ set_last_error("At least one MIME type must be specified".to_string());
2630
+ return false;
2631
+ }
2632
+
2633
+ let extractor = Arc::new(FfiDocumentExtractor::new(
2634
+ name_str.to_string(),
2635
+ callback,
2636
+ supported_types,
2637
+ priority,
2638
+ ));
2639
+
2640
+ let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
2641
+ let mut registry_guard = match registry.write() {
2642
+ Ok(guard) => guard,
2643
+ Err(e) => {
2644
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2645
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2646
+ return false;
2647
+ }
2648
+ };
2649
+
2650
+ match registry_guard.register(extractor) {
2651
+ Ok(()) => true,
2652
+ Err(e) => {
2653
+ set_last_error(format!("Failed to register DocumentExtractor: {}", e));
2654
+ false
2655
+ }
2656
+ }
2657
+ })
2658
+ }
2659
+
2660
+ /// Unregister a DocumentExtractor by name.
2661
+ ///
2662
+ /// # Safety
2663
+ ///
2664
+ /// - `name` must be a valid null-terminated C string
2665
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2666
+ ///
2667
+ /// # Example (C)
2668
+ ///
2669
+ /// ```c
2670
+ /// bool success = kreuzberg_unregister_document_extractor("my-extractor");
2671
+ /// if (!success) {
2672
+ /// const char* error = kreuzberg_last_error();
2673
+ /// printf("Failed to unregister: %s\n", error);
2674
+ /// }
2675
+ /// ```
2676
+ #[unsafe(no_mangle)]
2677
+ pub unsafe extern "C" fn kreuzberg_unregister_document_extractor(name: *const c_char) -> bool {
2678
+ ffi_panic_guard_bool!("kreuzberg_unregister_document_extractor", {
2679
+ clear_last_error();
2680
+
2681
+ if name.is_null() {
2682
+ set_last_error("DocumentExtractor name cannot be NULL".to_string());
2683
+ return false;
2684
+ }
2685
+
2686
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2687
+ Ok(s) => s,
2688
+ Err(e) => {
2689
+ set_last_error(format!("Invalid UTF-8 in DocumentExtractor name: {}", e));
2690
+ return false;
2691
+ }
2692
+ };
2693
+
2694
+ let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
2695
+ let mut registry_guard = match registry.write() {
2696
+ Ok(guard) => guard,
2697
+ Err(e) => {
2698
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2699
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2700
+ return false;
2701
+ }
2702
+ };
2703
+
2704
+ match registry_guard.remove(name_str) {
2705
+ Ok(()) => true,
2706
+ Err(e) => {
2707
+ set_last_error(format!("Failed to remove DocumentExtractor: {}", e));
2708
+ false
2709
+ }
2710
+ }
2711
+ })
2712
+ }
2713
+
2714
+ /// List all registered DocumentExtractors as a JSON array of names.
2715
+ ///
2716
+ /// # Safety
2717
+ ///
2718
+ /// - Returned string must be freed with `kreuzberg_free_string`.
2719
+ /// - Returns NULL on error (check `kreuzberg_last_error`).
2720
+ #[unsafe(no_mangle)]
2721
+ pub unsafe extern "C" fn kreuzberg_list_document_extractors() -> *mut c_char {
2722
+ ffi_panic_guard!("kreuzberg_list_document_extractors", {
2723
+ clear_last_error();
2724
+
2725
+ let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
2726
+ let registry_guard = match registry.read() {
2727
+ Ok(guard) => guard,
2728
+ Err(e) => {
2729
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2730
+ set_last_error(format!("Failed to acquire registry read lock: {}", e));
2731
+ return ptr::null_mut();
2732
+ }
2733
+ };
2734
+
2735
+ match serde_json::to_string(&registry_guard.list()) {
2736
+ Ok(json) => match CString::new(json) {
2737
+ Ok(cstr) => cstr.into_raw(),
2738
+ Err(e) => {
2739
+ set_last_error(format!("Failed to create C string: {}", e));
2740
+ ptr::null_mut()
2741
+ }
2742
+ },
2743
+ Err(e) => {
2744
+ set_last_error(format!("Failed to serialize DocumentExtractor list: {}", e));
2745
+ ptr::null_mut()
2746
+ }
2747
+ }
2748
+ })
2749
+ }
2750
+
2751
+ /// Type alias for the Validator callback function.
2752
+ ///
2753
+ /// # Parameters
2754
+ ///
2755
+ /// - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
2756
+ ///
2757
+ /// # Returns
2758
+ ///
2759
+ /// Null-terminated error message string if validation fails (must be freed by Rust
2760
+ /// via kreuzberg_free_string), or NULL if validation passes.
2761
+ ///
2762
+ /// # Safety
2763
+ ///
2764
+ /// The callback must:
2765
+ /// - Not store the result_json pointer (it's only valid for the duration of the call)
2766
+ /// - Return a valid null-terminated UTF-8 string (error message) if validation fails
2767
+ /// - Return NULL if validation passes
2768
+ /// - The returned string must be freeable by kreuzberg_free_string
2769
+ type ValidatorCallback = unsafe extern "C" fn(result_json: *const c_char) -> *mut c_char;
2770
+
2771
+ /// FFI wrapper for custom Validators registered from Java/C.
2772
+ ///
2773
+ /// This struct wraps a C function pointer and implements the Validator trait,
2774
+ /// allowing custom validation implementations from FFI languages to be registered
2775
+ /// and used within the Rust extraction pipeline.
2776
+ struct FfiValidator {
2777
+ name: String,
2778
+ callback: ValidatorCallback,
2779
+ priority: i32,
2780
+ }
2781
+
2782
+ impl FfiValidator {
2783
+ fn new(name: String, callback: ValidatorCallback, priority: i32) -> Self {
2784
+ Self {
2785
+ name,
2786
+ callback,
2787
+ priority,
2788
+ }
2789
+ }
2790
+ }
2791
+
2792
+ impl Plugin for FfiValidator {
2793
+ fn name(&self) -> &str {
2794
+ &self.name
2795
+ }
2796
+
2797
+ fn version(&self) -> String {
2798
+ "ffi-1.0.0".to_string()
2799
+ }
2800
+
2801
+ fn initialize(&self) -> Result<()> {
2802
+ Ok(())
2803
+ }
2804
+
2805
+ fn shutdown(&self) -> Result<()> {
2806
+ Ok(())
2807
+ }
2808
+ }
2809
+
2810
+ #[async_trait]
2811
+ impl kreuzberg::plugins::Validator for FfiValidator {
2812
+ fn priority(&self) -> i32 {
2813
+ self.priority
2814
+ }
2815
+
2816
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
2817
+ let result_json = serde_json::to_string(result).map_err(|e| KreuzbergError::Validation {
2818
+ message: format!("Failed to serialize ExtractionResult: {}", e),
2819
+ source: Some(Box::new(e)),
2820
+ })?;
2821
+
2822
+ let callback = self.callback;
2823
+ let validator_name = self.name.clone();
2824
+ let result_json_owned = result_json.clone();
2825
+
2826
+ let error_msg = tokio::task::spawn_blocking(move || {
2827
+ let result_cstring = CString::new(result_json_owned).map_err(|e| KreuzbergError::Validation {
2828
+ message: format!("Failed to create C string from result JSON: {}", e),
2829
+ source: Some(Box::new(e)),
2830
+ })?;
2831
+
2832
+ let error_ptr = unsafe { callback(result_cstring.as_ptr()) };
2833
+
2834
+ if error_ptr.is_null() {
2835
+ return Ok::<Option<String>, KreuzbergError>(None);
2836
+ }
2837
+
2838
+ let error_cstr = unsafe { CStr::from_ptr(error_ptr) };
2839
+ let error_msg = error_cstr
2840
+ .to_str()
2841
+ .map_err(|e| KreuzbergError::Plugin {
2842
+ message: format!("Validator returned invalid UTF-8: {}", e),
2843
+ plugin_name: validator_name.clone(),
2844
+ })?
2845
+ .to_string();
2846
+
2847
+ unsafe { kreuzberg_free_string(error_ptr) };
2848
+
2849
+ Ok(Some(error_msg))
2850
+ })
2851
+ .await
2852
+ .map_err(|e| KreuzbergError::Plugin {
2853
+ message: format!("Validator task panicked: {}", e),
2854
+ plugin_name: self.name.clone(),
2855
+ })??;
2856
+
2857
+ if let Some(msg) = error_msg {
2858
+ return Err(KreuzbergError::Validation {
2859
+ message: msg,
2860
+ source: None,
2861
+ });
2862
+ }
2863
+
2864
+ Ok(())
2865
+ }
2866
+ }
2867
+
2868
+ /// Register a custom Validator via FFI callback.
2869
+ ///
2870
+ /// # Safety
2871
+ ///
2872
+ /// - `name` must be a valid null-terminated C string
2873
+ /// - `callback` must be a valid function pointer that:
2874
+ /// - Does not store the result_json pointer
2875
+ /// - Returns a null-terminated UTF-8 string (error message) if validation fails
2876
+ /// - Returns NULL if validation passes
2877
+ /// - The returned string must be freeable by kreuzberg_free_string
2878
+ /// - `priority` determines the order of validation (higher priority runs first)
2879
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2880
+ ///
2881
+ /// # Example (C)
2882
+ ///
2883
+ /// ```c
2884
+ /// char* my_validator(const char* result_json) {
2885
+ /// // Parse result_json, validate it
2886
+ /// // Return error message if validation fails, NULL if passes
2887
+ /// if (invalid) {
2888
+ /// return strdup("Validation failed: content too short");
2889
+ /// }
2890
+ /// return NULL;
2891
+ /// }
2892
+ ///
2893
+ /// bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
2894
+ /// if (!success) {
2895
+ /// const char* error = kreuzberg_last_error();
2896
+ /// printf("Failed to register: %s\n", error);
2897
+ /// }
2898
+ /// ```
2899
+ #[unsafe(no_mangle)]
2900
+ pub unsafe extern "C" fn kreuzberg_register_validator(
2901
+ name: *const c_char,
2902
+ callback: ValidatorCallback,
2903
+ priority: i32,
2904
+ ) -> bool {
2905
+ ffi_panic_guard_bool!("kreuzberg_register_validator", {
2906
+ clear_last_error();
2907
+
2908
+ if name.is_null() {
2909
+ set_last_error("Validator name cannot be NULL".to_string());
2910
+ return false;
2911
+ }
2912
+
2913
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2914
+ Ok(s) => s,
2915
+ Err(e) => {
2916
+ set_last_error(format!("Invalid UTF-8 in Validator name: {}", e));
2917
+ return false;
2918
+ }
2919
+ };
2920
+
2921
+ if name_str.is_empty() {
2922
+ set_last_error("Plugin name cannot be empty".to_string());
2923
+ return false;
2924
+ }
2925
+
2926
+ if name_str.chars().any(|c| c.is_whitespace()) {
2927
+ set_last_error("Plugin name cannot contain whitespace".to_string());
2928
+ return false;
2929
+ }
2930
+
2931
+ let validator = Arc::new(FfiValidator::new(name_str.to_string(), callback, priority));
2932
+
2933
+ let registry = kreuzberg::plugins::registry::get_validator_registry();
2934
+ let mut registry_guard = match registry.write() {
2935
+ Ok(guard) => guard,
2936
+ Err(e) => {
2937
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2938
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2939
+ return false;
2940
+ }
2941
+ };
2942
+
2943
+ match registry_guard.register(validator) {
2944
+ Ok(()) => true,
2945
+ Err(e) => {
2946
+ set_last_error(format!("Failed to register Validator: {}", e));
2947
+ false
2948
+ }
2949
+ }
2950
+ })
2951
+ }
2952
+
2953
+ /// Unregister a Validator by name.
2954
+ ///
2955
+ /// # Safety
2956
+ ///
2957
+ /// - `name` must be a valid null-terminated C string
2958
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
2959
+ ///
2960
+ /// # Example (C)
2961
+ ///
2962
+ /// ```c
2963
+ /// bool success = kreuzberg_unregister_validator("my-validator");
2964
+ /// if (!success) {
2965
+ /// const char* error = kreuzberg_last_error();
2966
+ /// printf("Failed to unregister: %s\n", error);
2967
+ /// }
2968
+ /// ```
2969
+ #[unsafe(no_mangle)]
2970
+ pub unsafe extern "C" fn kreuzberg_unregister_validator(name: *const c_char) -> bool {
2971
+ ffi_panic_guard_bool!("kreuzberg_unregister_validator", {
2972
+ clear_last_error();
2973
+
2974
+ if name.is_null() {
2975
+ set_last_error("Validator name cannot be NULL".to_string());
2976
+ return false;
2977
+ }
2978
+
2979
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
2980
+ Ok(s) => s,
2981
+ Err(e) => {
2982
+ set_last_error(format!("Invalid UTF-8 in Validator name: {}", e));
2983
+ return false;
2984
+ }
2985
+ };
2986
+
2987
+ let registry = kreuzberg::plugins::registry::get_validator_registry();
2988
+ let mut registry_guard = match registry.write() {
2989
+ Ok(guard) => guard,
2990
+ Err(e) => {
2991
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
2992
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
2993
+ return false;
2994
+ }
2995
+ };
2996
+
2997
+ match registry_guard.remove(name_str) {
2998
+ Ok(()) => true,
2999
+ Err(e) => {
3000
+ set_last_error(format!("Failed to remove Validator: {}", e));
3001
+ false
3002
+ }
3003
+ }
3004
+ })
3005
+ }
3006
+
3007
+ /// Clear all registered Validators.
3008
+ ///
3009
+ /// # Safety
3010
+ ///
3011
+ /// - Removes all validators. Subsequent extractions will skip custom validation.
3012
+ /// - Returns true on success, false on error.
3013
+ #[unsafe(no_mangle)]
3014
+ pub unsafe extern "C" fn kreuzberg_clear_validators() -> bool {
3015
+ ffi_panic_guard_bool!("kreuzberg_clear_validators", {
3016
+ clear_last_error();
3017
+
3018
+ let registry = kreuzberg::plugins::registry::get_validator_registry();
3019
+ let mut registry_guard = match registry.write() {
3020
+ Ok(guard) => guard,
3021
+ Err(e) => {
3022
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
3023
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
3024
+ return false;
3025
+ }
3026
+ };
3027
+
3028
+ *registry_guard = Default::default();
3029
+ true
3030
+ })
3031
+ }
3032
+
3033
+ /// List all registered Validators as a JSON array of names.
3034
+ ///
3035
+ /// # Safety
3036
+ ///
3037
+ /// - Returned string must be freed with `kreuzberg_free_string`.
3038
+ /// - Returns NULL on error (check `kreuzberg_last_error`).
3039
+ #[unsafe(no_mangle)]
3040
+ pub unsafe extern "C" fn kreuzberg_list_validators() -> *mut c_char {
3041
+ ffi_panic_guard!("kreuzberg_list_validators", {
3042
+ clear_last_error();
3043
+
3044
+ let registry = kreuzberg::plugins::registry::get_validator_registry();
3045
+ let registry_guard = match registry.read() {
3046
+ Ok(guard) => guard,
3047
+ Err(e) => {
3048
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
3049
+ set_last_error(format!("Failed to acquire registry read lock: {}", e));
3050
+ return ptr::null_mut();
3051
+ }
3052
+ };
3053
+
3054
+ match serde_json::to_string(&registry_guard.list()) {
3055
+ Ok(json) => match CString::new(json) {
3056
+ Ok(cstr) => cstr.into_raw(),
3057
+ Err(e) => {
3058
+ set_last_error(format!("Failed to create C string: {}", e));
3059
+ ptr::null_mut()
3060
+ }
3061
+ },
3062
+ Err(e) => {
3063
+ set_last_error(format!("Failed to serialize Validator list: {}", e));
3064
+ ptr::null_mut()
3065
+ }
3066
+ }
3067
+ })
3068
+ }
3069
+
3070
+ /// Unregister an OCR backend by name.
3071
+ ///
3072
+ /// # Safety
3073
+ ///
3074
+ /// - `name` must be a valid null-terminated C string
3075
+ /// - Returns true on success, false on error (check kreuzberg_last_error)
3076
+ ///
3077
+ /// # Example (C)
3078
+ ///
3079
+ /// ```c
3080
+ /// bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
3081
+ /// if (!success) {
3082
+ /// const char* error = kreuzberg_last_error();
3083
+ /// printf("Failed to unregister: %s\n", error);
3084
+ /// }
3085
+ /// ```
3086
+ #[unsafe(no_mangle)]
3087
+ pub unsafe extern "C" fn kreuzberg_unregister_ocr_backend(name: *const c_char) -> bool {
3088
+ ffi_panic_guard_bool!("kreuzberg_unregister_ocr_backend", {
3089
+ clear_last_error();
3090
+
3091
+ if name.is_null() {
3092
+ set_last_error("OCR backend name cannot be NULL".to_string());
3093
+ return false;
3094
+ }
3095
+
3096
+ let name_str = match unsafe { CStr::from_ptr(name) }.to_str() {
3097
+ Ok(s) => s,
3098
+ Err(e) => {
3099
+ set_last_error(format!("Invalid UTF-8 in OCR backend name: {}", e));
3100
+ return false;
3101
+ }
3102
+ };
3103
+
3104
+ if name_str.is_empty() {
3105
+ set_last_error("OCR backend name cannot be empty".to_string());
3106
+ return false;
3107
+ }
3108
+
3109
+ if name_str.chars().any(|c| c.is_whitespace()) {
3110
+ set_last_error("OCR backend name cannot contain whitespace".to_string());
3111
+ return false;
3112
+ }
3113
+
3114
+ match kreuzberg::plugins::unregister_ocr_backend(name_str) {
3115
+ Ok(()) => true,
3116
+ Err(e) => {
3117
+ set_last_error(e.to_string());
3118
+ false
3119
+ }
3120
+ }
3121
+ })
3122
+ }
3123
+
3124
+ /// List all registered OCR backends as a JSON array of names.
3125
+ ///
3126
+ /// # Safety
3127
+ ///
3128
+ /// - Returned string must be freed with `kreuzberg_free_string`.
3129
+ /// - Returns NULL on error (check `kreuzberg_last_error`).
3130
+ ///
3131
+ /// # Example (C)
3132
+ ///
3133
+ /// ```c
3134
+ /// char* backends = kreuzberg_list_ocr_backends();
3135
+ /// if (backends == NULL) {
3136
+ /// const char* error = kreuzberg_last_error();
3137
+ /// printf("Failed to list backends: %s\n", error);
3138
+ /// } else {
3139
+ /// printf("OCR backends: %s\n", backends);
3140
+ /// kreuzberg_free_string(backends);
3141
+ /// }
3142
+ /// ```
3143
+ #[unsafe(no_mangle)]
3144
+ pub unsafe extern "C" fn kreuzberg_list_ocr_backends() -> *mut c_char {
3145
+ ffi_panic_guard!("kreuzberg_list_ocr_backends", {
3146
+ clear_last_error();
3147
+
3148
+ match kreuzberg::plugins::list_ocr_backends() {
3149
+ Ok(backends) => match serde_json::to_string(&backends) {
3150
+ Ok(json) => match CString::new(json) {
3151
+ Ok(cstr) => cstr.into_raw(),
3152
+ Err(e) => {
3153
+ set_last_error(format!("Failed to create C string: {}", e));
3154
+ ptr::null_mut()
3155
+ }
3156
+ },
3157
+ Err(e) => {
3158
+ set_last_error(format!("Failed to serialize OCR backend list: {}", e));
3159
+ ptr::null_mut()
3160
+ }
3161
+ },
3162
+ Err(e) => {
3163
+ set_last_error(e.to_string());
3164
+ ptr::null_mut()
3165
+ }
3166
+ }
3167
+ })
3168
+ }
3169
+
3170
+ /// Clear all registered OCR backends.
3171
+ ///
3172
+ /// # Safety
3173
+ ///
3174
+ /// - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
3175
+ /// - Returns true on success, false on error.
3176
+ ///
3177
+ /// # Example (C)
3178
+ ///
3179
+ /// ```c
3180
+ /// bool success = kreuzberg_clear_ocr_backends();
3181
+ /// if (!success) {
3182
+ /// const char* error = kreuzberg_last_error();
3183
+ /// printf("Failed to clear OCR backends: %s\n", error);
3184
+ /// }
3185
+ /// ```
3186
+ #[unsafe(no_mangle)]
3187
+ pub unsafe extern "C" fn kreuzberg_clear_ocr_backends() -> bool {
3188
+ ffi_panic_guard_bool!("kreuzberg_clear_ocr_backends", {
3189
+ clear_last_error();
3190
+
3191
+ match kreuzberg::plugins::clear_ocr_backends() {
3192
+ Ok(()) => true,
3193
+ Err(e) => {
3194
+ set_last_error(e.to_string());
3195
+ false
3196
+ }
3197
+ }
3198
+ })
3199
+ }
3200
+
3201
+ /// Clear all registered DocumentExtractors.
3202
+ ///
3203
+ /// # Safety
3204
+ ///
3205
+ /// - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
3206
+ /// - Returns true on success, false on error.
3207
+ ///
3208
+ /// # Example (C)
3209
+ ///
3210
+ /// ```c
3211
+ /// bool success = kreuzberg_clear_document_extractors();
3212
+ /// if (!success) {
3213
+ /// const char* error = kreuzberg_last_error();
3214
+ /// printf("Failed to clear document extractors: %s\n", error);
3215
+ /// }
3216
+ /// ```
3217
+ #[unsafe(no_mangle)]
3218
+ pub unsafe extern "C" fn kreuzberg_clear_document_extractors() -> bool {
3219
+ ffi_panic_guard_bool!("kreuzberg_clear_document_extractors", {
3220
+ clear_last_error();
3221
+
3222
+ let registry = kreuzberg::plugins::registry::get_document_extractor_registry();
3223
+ let mut registry_guard = match registry.write() {
3224
+ Ok(guard) => guard,
3225
+ Err(e) => {
3226
+ // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
3227
+ set_last_error(format!("Failed to acquire registry write lock: {}", e));
3228
+ return false;
3229
+ }
3230
+ };
3231
+
3232
+ *registry_guard = Default::default();
3233
+ true
3234
+ })
3235
+ }
3236
+
3237
+ /// Detect MIME type from raw bytes.
3238
+ ///
3239
+ /// # Safety
3240
+ ///
3241
+ /// - `bytes` must be a valid pointer to byte data
3242
+ /// - `len` must be the correct length of the byte array
3243
+ /// - The returned string must be freed with `kreuzberg_free_string`
3244
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
3245
+ ///
3246
+ /// # Example (C)
3247
+ ///
3248
+ /// ```c
3249
+ /// const char* pdf_bytes = "%PDF-1.4\n";
3250
+ /// char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
3251
+ /// if (mime == NULL) {
3252
+ /// const char* error = kreuzberg_last_error();
3253
+ /// printf("Failed to detect MIME type: %s\n", error);
3254
+ /// } else {
3255
+ /// printf("MIME type: %s\n", mime);
3256
+ /// kreuzberg_free_string(mime);
3257
+ /// }
3258
+ /// ```
3259
+ #[unsafe(no_mangle)]
3260
+ pub unsafe extern "C" fn kreuzberg_detect_mime_type_from_bytes(bytes: *const u8, len: usize) -> *mut c_char {
3261
+ ffi_panic_guard!("kreuzberg_detect_mime_type_from_bytes", {
3262
+ clear_last_error();
3263
+
3264
+ if bytes.is_null() {
3265
+ set_last_error("bytes cannot be NULL".to_string());
3266
+ return ptr::null_mut();
3267
+ }
3268
+
3269
+ let slice = unsafe { std::slice::from_raw_parts(bytes, len) };
3270
+
3271
+ match kreuzberg::core::mime::detect_mime_type_from_bytes(slice) {
3272
+ Ok(mime) => match string_to_c_string(mime) {
3273
+ Ok(ptr) => ptr,
3274
+ Err(e) => {
3275
+ set_last_error(e);
3276
+ ptr::null_mut()
3277
+ }
3278
+ },
3279
+ Err(e) => {
3280
+ set_last_error(e.to_string());
3281
+ ptr::null_mut()
3282
+ }
3283
+ }
3284
+ })
3285
+ }
3286
+
3287
+ /// Detect MIME type from file path (checks extension and reads file content).
3288
+ ///
3289
+ /// # Safety
3290
+ ///
3291
+ /// - `file_path` must be a valid null-terminated C string
3292
+ /// - The returned string must be freed with `kreuzberg_free_string`
3293
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
3294
+ ///
3295
+ /// # Example (C)
3296
+ ///
3297
+ /// ```c
3298
+ /// char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
3299
+ /// if (mime == NULL) {
3300
+ /// const char* error = kreuzberg_last_error();
3301
+ /// printf("Failed to detect MIME type: %s\n", error);
3302
+ /// } else {
3303
+ /// printf("MIME type: %s\n", mime);
3304
+ /// kreuzberg_free_string(mime);
3305
+ /// }
3306
+ /// ```
3307
+ #[unsafe(no_mangle)]
3308
+ pub unsafe extern "C" fn kreuzberg_detect_mime_type_from_path(file_path: *const c_char) -> *mut c_char {
3309
+ ffi_panic_guard!("kreuzberg_detect_mime_type_from_path", {
3310
+ clear_last_error();
3311
+
3312
+ if file_path.is_null() {
3313
+ set_last_error("file_path cannot be NULL".to_string());
3314
+ return ptr::null_mut();
3315
+ }
3316
+
3317
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
3318
+ Ok(s) => s,
3319
+ Err(e) => {
3320
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
3321
+ return ptr::null_mut();
3322
+ }
3323
+ };
3324
+
3325
+ match kreuzberg::core::mime::detect_mime_type(path_str, true) {
3326
+ Ok(mime) => match string_to_c_string(mime) {
3327
+ Ok(ptr) => ptr,
3328
+ Err(e) => {
3329
+ set_last_error(e);
3330
+ ptr::null_mut()
3331
+ }
3332
+ },
3333
+ Err(e) => {
3334
+ // ~keep: IO errors from file operations should bubble up as they indicate
3335
+ set_last_error(e.to_string());
3336
+ ptr::null_mut()
3337
+ }
3338
+ }
3339
+ })
3340
+ }
3341
+
3342
+ /// Get file extensions for a MIME type.
3343
+ ///
3344
+ /// # Safety
3345
+ ///
3346
+ /// - `mime_type` must be a valid null-terminated C string
3347
+ /// - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
3348
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
3349
+ ///
3350
+ /// # Example (C)
3351
+ ///
3352
+ /// ```c
3353
+ /// char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
3354
+ /// if (extensions == NULL) {
3355
+ /// const char* error = kreuzberg_last_error();
3356
+ /// printf("Failed to get extensions: %s\n", error);
3357
+ /// } else {
3358
+ /// printf("Extensions: %s\n", extensions);
3359
+ /// kreuzberg_free_string(extensions);
3360
+ /// }
3361
+ /// ```
3362
+ #[unsafe(no_mangle)]
3363
+ pub unsafe extern "C" fn kreuzberg_get_extensions_for_mime(mime_type: *const c_char) -> *mut c_char {
3364
+ ffi_panic_guard!("kreuzberg_get_extensions_for_mime", {
3365
+ clear_last_error();
3366
+
3367
+ if mime_type.is_null() {
3368
+ set_last_error("mime_type cannot be NULL".to_string());
3369
+ return ptr::null_mut();
3370
+ }
3371
+
3372
+ let mime_str = match unsafe { CStr::from_ptr(mime_type) }.to_str() {
3373
+ Ok(s) => s,
3374
+ Err(e) => {
3375
+ set_last_error(format!("Invalid UTF-8 in MIME type: {}", e));
3376
+ return ptr::null_mut();
3377
+ }
3378
+ };
3379
+
3380
+ match kreuzberg::core::mime::get_extensions_for_mime(mime_str) {
3381
+ Ok(extensions) => match serde_json::to_string(&extensions) {
3382
+ Ok(json) => match string_to_c_string(json) {
3383
+ Ok(ptr) => ptr,
3384
+ Err(e) => {
3385
+ set_last_error(e);
3386
+ ptr::null_mut()
3387
+ }
3388
+ },
3389
+ Err(e) => {
3390
+ set_last_error(format!("Failed to serialize extensions: {}", e));
3391
+ ptr::null_mut()
3392
+ }
3393
+ },
3394
+ Err(e) => {
3395
+ set_last_error(e.to_string());
3396
+ ptr::null_mut()
3397
+ }
3398
+ }
3399
+ })
3400
+ }
3401
+
3402
+ /// Load an ExtractionConfig from a file.
3403
+ ///
3404
+ /// Automatically detects the file format based on extension:
3405
+ /// - `.toml` - TOML format
3406
+ /// - `.yaml`, `.yml` - YAML format
3407
+ /// - `.json` - JSON format
3408
+ ///
3409
+ /// # Safety
3410
+ ///
3411
+ /// - `path` must be a valid null-terminated C string representing a file path
3412
+ /// - Returns a pointer to ExtractionConfig on success, NULL on error
3413
+ /// - The returned config must be freed with `kreuzberg_free_config`
3414
+ /// - Check `kreuzberg_last_error` on NULL return
3415
+ ///
3416
+ /// # Example (C)
3417
+ ///
3418
+ /// ```c
3419
+ /// ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
3420
+ /// if (config == NULL) {
3421
+ /// const char* error = kreuzberg_last_error();
3422
+ /// printf("Failed to load config: %s\n", error);
3423
+ /// return 1;
3424
+ /// }
3425
+ ///
3426
+ /// // Use config...
3427
+ /// char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
3428
+ ///
3429
+ /// kreuzberg_free_config(config);
3430
+ /// ```
3431
+ #[unsafe(no_mangle)]
3432
+ pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
3433
+ ffi_panic_guard!("kreuzberg_config_from_file", {
3434
+ clear_last_error();
3435
+
3436
+ if path.is_null() {
3437
+ set_last_error("Config path cannot be NULL".to_string());
3438
+ return ptr::null_mut();
3439
+ }
3440
+
3441
+ let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
3442
+ Ok(s) => s,
3443
+ Err(e) => {
3444
+ set_last_error(format!("Invalid UTF-8 in config path: {}", e));
3445
+ return ptr::null_mut();
3446
+ }
3447
+ };
3448
+
3449
+ let path_buf = Path::new(path_str);
3450
+
3451
+ match ExtractionConfig::from_file(path_buf) {
3452
+ Ok(config) => Box::into_raw(Box::new(config)),
3453
+ Err(e) => {
3454
+ // ~keep: IO errors from file operations should bubble up as they indicate
3455
+ match &e {
3456
+ KreuzbergError::Io(io_err) => {
3457
+ set_last_error(format!("IO error loading config: {}", io_err));
3458
+ }
3459
+ _ => {
3460
+ set_last_error(format!("Failed to load config from file: {}", e));
3461
+ }
3462
+ }
3463
+ ptr::null_mut()
3464
+ }
3465
+ }
3466
+ })
3467
+ }
3468
+
3469
+ /// Discover and load an ExtractionConfig by searching parent directories.
3470
+ ///
3471
+ /// Searches the current directory and all parent directories for:
3472
+ /// - `kreuzberg.toml`
3473
+ /// - `kreuzberg.yaml`
3474
+ /// - `kreuzberg.yml`
3475
+ /// - `kreuzberg.json`
3476
+ ///
3477
+ /// Returns the first config file found as JSON, or NULL if none found.
3478
+ ///
3479
+ /// # Safety
3480
+ ///
3481
+ /// - The returned string must be freed with `kreuzberg_free_string`
3482
+ /// - Returns NULL if no config found or on error (check `kreuzberg_last_error`)
3483
+ ///
3484
+ /// # Example (C)
3485
+ ///
3486
+ /// ```c
3487
+ /// char* config_json = kreuzberg_config_discover();
3488
+ /// if (config_json == NULL) {
3489
+ /// const char* error = kreuzberg_last_error();
3490
+ /// if (error != NULL && strlen(error) > 0) {
3491
+ /// printf("Error discovering config: %s\n", error);
3492
+ /// return 1;
3493
+ /// }
3494
+ /// // No config found, use defaults
3495
+ /// printf("No config file found\n");
3496
+ /// } else {
3497
+ /// printf("Config: %s\n", config_json);
3498
+ /// kreuzberg_free_string(config_json);
3499
+ /// }
3500
+ /// ```
3501
+ #[unsafe(no_mangle)]
3502
+ pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
3503
+ ffi_panic_guard!("kreuzberg_config_discover", {
3504
+ clear_last_error();
3505
+
3506
+ match ExtractionConfig::discover() {
3507
+ Ok(Some(config)) => match serde_json::to_string(&config) {
3508
+ Ok(json) => match CString::new(json) {
3509
+ Ok(cstr) => cstr.into_raw(),
3510
+ Err(e) => {
3511
+ set_last_error(format!("Failed to serialize config: {}", e));
3512
+ ptr::null_mut()
3513
+ }
3514
+ },
3515
+ Err(e) => {
3516
+ set_last_error(format!("Failed to serialize config: {}", e));
3517
+ ptr::null_mut()
3518
+ }
3519
+ },
3520
+ Ok(None) => ptr::null_mut(),
3521
+ Err(e) => {
3522
+ // ~keep: IO errors from directory traversal should bubble up as they indicate
3523
+ match &e {
3524
+ KreuzbergError::Io(io_err) => {
3525
+ set_last_error(format!("IO error discovering config: {}", io_err));
3526
+ }
3527
+ _ => {
3528
+ set_last_error(format!("Failed to discover config: {}", e));
3529
+ }
3530
+ }
3531
+ ptr::null_mut()
3532
+ }
3533
+ }
3534
+ })
3535
+ }
3536
+
3537
+ #[allow(non_upper_case_globals)]
3538
+ const _: () = {
3539
+ const fn assert_c_extraction_result_size() {
3540
+ const SIZE: usize = std::mem::size_of::<CExtractionResult>();
3541
+ const _: () = assert!(SIZE == 96, "CExtractionResult size must be 96 bytes");
3542
+ }
3543
+
3544
+ const fn assert_c_extraction_result_alignment() {
3545
+ const ALIGN: usize = std::mem::align_of::<CExtractionResult>();
3546
+ const _: () = assert!(ALIGN == 8, "CExtractionResult alignment must be 8 bytes");
3547
+ }
3548
+
3549
+ const fn assert_c_batch_result_size() {
3550
+ const SIZE: usize = std::mem::size_of::<CBatchResult>();
3551
+ const _: () = assert!(SIZE == 24, "CBatchResult size must be 24 bytes");
3552
+ }
3553
+
3554
+ const fn assert_c_batch_result_alignment() {
3555
+ const ALIGN: usize = std::mem::align_of::<CBatchResult>();
3556
+ const _: () = assert!(ALIGN == 8, "CBatchResult alignment must be 8 bytes");
3557
+ }
3558
+
3559
+ const fn assert_c_bytes_with_mime_size() {
3560
+ const SIZE: usize = std::mem::size_of::<CBytesWithMime>();
3561
+ const _: () = assert!(SIZE == 24, "CBytesWithMime size must be 24 bytes");
3562
+ }
3563
+
3564
+ const fn assert_c_bytes_with_mime_alignment() {
3565
+ const ALIGN: usize = std::mem::align_of::<CBytesWithMime>();
3566
+ const _: () = assert!(ALIGN == 8, "CBytesWithMime alignment must be 8 bytes");
3567
+ }
3568
+
3569
+ let _ = assert_c_extraction_result_size;
3570
+ let _ = assert_c_extraction_result_alignment;
3571
+ let _ = assert_c_batch_result_size;
3572
+ let _ = assert_c_batch_result_alignment;
3573
+ let _ = assert_c_bytes_with_mime_size;
3574
+ let _ = assert_c_bytes_with_mime_alignment;
3575
+ };
3576
+
3577
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// The version accessor must yield a non-null, non-empty UTF-8 string.
    #[test]
    fn test_version() {
        unsafe {
            let raw_version = kreuzberg_version();
            assert!(!raw_version.is_null());

            let text = CStr::from_ptr(raw_version).to_str().unwrap();
            assert!(!text.is_empty());
        }
    }

    /// A NULL path must be rejected with an error message that mentions "NULL".
    #[test]
    fn test_null_path() {
        unsafe {
            let extraction = kreuzberg_extract_file_sync(ptr::null());
            assert!(extraction.is_null());

            let raw_error = kreuzberg_last_error();
            assert!(!raw_error.is_null());

            let message = CStr::from_ptr(raw_error).to_str().unwrap();
            assert!(message.contains("NULL"));
        }
    }

    /// Extracting a file that does not exist must fail and record an error.
    #[test]
    fn test_nonexistent_file() {
        unsafe {
            let missing = CString::new("/nonexistent/file.pdf").unwrap();
            let extraction = kreuzberg_extract_file_sync(missing.as_ptr());
            assert!(extraction.is_null());

            let raw_error = kreuzberg_last_error();
            assert!(!raw_error.is_null());
        }
    }
}