kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +2 -105
  8. data/README.md +454 -454
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +2 -1
  60. data/vendor/kreuzberg/Cargo.toml +2 -2
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  310. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  311. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  312. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  313. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  315. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  316. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  317. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  318. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  319. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  320. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  321. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  322. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  323. data/vendor/kreuzberg-tesseract/README.md +399 -399
  324. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  325. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  326. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  327. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  328. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  329. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  330. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  331. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  332. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  333. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  334. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  335. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  336. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  337. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  338. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  339. data/vendor/rb-sys/Cargo.lock +393 -393
  340. data/vendor/rb-sys/Cargo.toml +70 -70
  341. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  342. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  343. data/vendor/rb-sys/LICENSE-MIT +21 -21
  344. data/vendor/rb-sys/build/features.rs +111 -111
  345. data/vendor/rb-sys/build/main.rs +286 -286
  346. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  347. data/vendor/rb-sys/build/version.rs +50 -50
  348. data/vendor/rb-sys/readme.md +36 -36
  349. data/vendor/rb-sys/src/bindings.rs +21 -21
  350. data/vendor/rb-sys/src/hidden.rs +11 -11
  351. data/vendor/rb-sys/src/lib.rs +35 -35
  352. data/vendor/rb-sys/src/macros.rs +371 -371
  353. data/vendor/rb-sys/src/memory.rs +53 -53
  354. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  355. data/vendor/rb-sys/src/special_consts.rs +31 -31
  356. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  357. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  358. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  359. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  360. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  361. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  362. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  364. data/vendor/rb-sys/src/stable_api.rs +260 -260
  365. data/vendor/rb-sys/src/symbol.rs +31 -31
  366. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  367. data/vendor/rb-sys/src/utils.rs +89 -89
  368. data/vendor/rb-sys/src/value_type.rs +7 -7
  369. metadata +7 -80
@@ -1,1087 +1,1087 @@
1
- /* Auto-generated C bindings for Kreuzberg */
2
-
3
- #ifndef KREUZBERG_FFI_H
4
- #define KREUZBERG_FFI_H
5
-
6
- #pragma once
7
-
8
- /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
9
-
10
- #include <stdarg.h>
11
- #include <stdbool.h>
12
- #include <stdint.h>
13
- #include <stdlib.h>
14
- /**
15
- * Opaque type for extraction configuration.
16
- * This is an opaque pointer type - callers should not access its internals.
17
- */
18
- typedef struct ExtractionConfig ExtractionConfig;
19
-
20
-
21
- /**
22
- * C-compatible extraction result structure
23
- *
24
- * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
25
- * Field order: 11 pointers (8 bytes each) + 1 bool + 7 bytes padding = 96 bytes total
26
- */
27
- typedef struct CExtractionResult {
28
- /**
29
- * Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
30
- */
31
- char *content;
32
- /**
33
- * Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
34
- */
35
- char *mime_type;
36
- /**
37
- * Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
38
- */
39
- char *language;
40
- /**
41
- * Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
42
- */
43
- char *date;
44
- /**
45
- * Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
46
- */
47
- char *subject;
48
- /**
49
- * Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
50
- */
51
- char *tables_json;
52
- /**
53
- * Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
54
- */
55
- char *detected_languages_json;
56
- /**
57
- * Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
58
- */
59
- char *metadata_json;
60
- /**
61
- * Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
62
- */
63
- char *chunks_json;
64
- /**
65
- * Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
66
- */
67
- char *images_json;
68
- /**
69
- * Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
70
- */
71
- char *page_structure_json;
72
- /**
73
- * Whether extraction was successful
74
- */
75
- bool success;
76
- /**
77
- * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
78
- */
79
- uint8_t _padding1[7];
80
- } CExtractionResult;
81
-
82
- /**
83
- * C-compatible structure for batch extraction results
84
- *
85
- * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
86
- * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
87
- */
88
- typedef struct CBatchResult {
89
- /**
90
- * Array of extraction results
91
- */
92
- struct CExtractionResult **results;
93
- /**
94
- * Number of results
95
- */
96
- uintptr_t count;
97
- /**
98
- * Whether batch operation was successful
99
- */
100
- bool success;
101
- /**
102
- * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
103
- */
104
- uint8_t _padding2[7];
105
- } CBatchResult;
106
-
107
- /**
108
- * C-compatible structure for passing byte array with MIME type in batch operations
109
- *
110
- * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
111
- * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
112
- */
113
- typedef struct CBytesWithMime {
114
- /**
115
- * Pointer to byte data
116
- */
117
- const uint8_t *data;
118
- /**
119
- * Length of byte data
120
- */
121
- uintptr_t data_len;
122
- /**
123
- * MIME type as null-terminated C string
124
- */
125
- const char *mime_type;
126
- } CBytesWithMime;
127
-
128
- /**
129
- * Type alias for the OCR backend callback function.
130
- *
131
- * # Parameters
132
- *
133
- * - `image_bytes`: Pointer to image data
134
- * - `image_length`: Length of image data in bytes
135
- * - `config_json`: JSON-encoded OcrConfig (null-terminated string)
136
- *
137
- * # Returns
138
- *
139
- * Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
140
- * or NULL on error.
141
- *
142
- * # Safety
143
- *
144
- * The callback must:
145
- * - Not store the image_bytes pointer (it's only valid for the duration of the call)
146
- * - Return a valid null-terminated UTF-8 string allocated by the caller
147
- * - Return NULL on error (error message should be retrievable separately)
148
- */
149
- typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
150
- uintptr_t image_length,
151
- const char *config_json);
152
-
153
- /**
154
- * Type alias for the PostProcessor callback function.
155
- *
156
- * # Parameters
157
- *
158
- * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
159
- *
160
- * # Returns
161
- *
162
- * Null-terminated JSON string containing the processed ExtractionResult
163
- * (must be freed by Rust via kreuzberg_free_string), or NULL on error.
164
- *
165
- * # Safety
166
- *
167
- * The callback must:
168
- * - Not store the result_json pointer (it's only valid for the duration of the call)
169
- * - Return a valid null-terminated UTF-8 JSON string allocated by the caller
170
- * - Return NULL on error (error message should be retrievable separately)
171
- */
172
- typedef char *(*PostProcessorCallback)(const char *result_json);
173
-
174
- /**
175
- * Type alias for the DocumentExtractor callback function.
176
- *
177
- * # Parameters
178
- *
179
- * - `content`: Raw document bytes
180
- * - `content_len`: Length of the content array
181
- * - `mime_type`: MIME type of the document (null-terminated string)
182
- * - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
183
- *
184
- * # Returns
185
- *
186
- * Null-terminated JSON string containing the ExtractionResult, or NULL on error.
187
- * The returned string must be freeable by kreuzberg_free_string.
188
- *
189
- * # Safety
190
- *
191
- * The callback must:
192
- * - Not store the content, mime_type, or config_json pointers (only valid during the call)
193
- * - Return a valid null-terminated UTF-8 JSON string or NULL on error
194
- * - The returned string must be freeable by kreuzberg_free_string
195
- */
196
- typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
197
- uintptr_t content_len,
198
- const char *mime_type,
199
- const char *config_json);
200
-
201
- /**
202
- * Type alias for the Validator callback function.
203
- *
204
- * # Parameters
205
- *
206
- * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
207
- *
208
- * # Returns
209
- *
210
- * Null-terminated error message string if validation fails (must be freed by Rust
211
- * via kreuzberg_free_string), or NULL if validation passes.
212
- *
213
- * # Safety
214
- *
215
- * The callback must:
216
- * - Not store the result_json pointer (it's only valid for the duration of the call)
217
- * - Return a valid null-terminated UTF-8 string (error message) if validation fails
218
- * - Return NULL if validation passes
219
- * - The returned string must be freeable by kreuzberg_free_string
220
- */
221
- typedef char *(*ValidatorCallback)(const char *result_json);
222
-
223
- /**
224
- * Extract text and metadata from a file (synchronous).
225
- *
226
- * # Safety
227
- *
228
- * - `file_path` must be a valid null-terminated C string
229
- * - The returned pointer must be freed with `kreuzberg_free_result`
230
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
231
- *
232
- * # Example (C)
233
- *
234
- * ```c
235
- * const char* path = "/path/to/document.pdf";
236
- * CExtractionResult* result = kreuzberg_extract_file_sync(path);
237
- * if (result != NULL && result->success) {
238
- * printf("Content: %s\n", result->content);
239
- * printf("MIME: %s\n", result->mime_type);
240
- * kreuzberg_free_result(result);
241
- * } else {
242
- * const char* error = kreuzberg_last_error();
243
- * printf("Error: %s\n", error);
244
- * }
245
- * ```
246
- */
247
- struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
248
-
249
- /**
250
- * Detect MIME type from a file path.
251
- *
252
- * # Safety
253
- *
254
- * - `file_path` must be a valid null-terminated C string
255
- * - The returned string must be freed with `kreuzberg_free_string`
256
- * - Returns NULL on error (check `kreuzberg_last_error`)
257
- */
258
- char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
259
-
260
- /**
261
- * Validate that a MIME type is supported by Kreuzberg.
262
- *
263
- * # Safety
264
- *
265
- * - `mime_type` must be a valid null-terminated C string
266
- * - The returned string must be freed with `kreuzberg_free_string`
267
- * - Returns NULL on error (check `kreuzberg_last_error`)
268
- */
269
- char *kreuzberg_validate_mime_type(const char *mime_type);
270
-
271
- /**
272
- * List available embedding preset names.
273
- *
274
- * # Safety
275
- *
276
- * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
277
- * - Returns NULL on error (check `kreuzberg_last_error`)
278
- */
279
- char *kreuzberg_list_embedding_presets(void);
280
-
281
- /**
282
- * Get a specific embedding preset by name.
283
- *
284
- * # Safety
285
- *
286
- * - `name` must be a valid null-terminated C string
287
- * - Returned string is JSON object and must be freed with `kreuzberg_free_string`
288
- * - Returns NULL on error (check `kreuzberg_last_error`)
289
- */
290
- char *kreuzberg_get_embedding_preset(const char *name);
291
-
292
- /**
293
- * Extract text and metadata from a file with custom configuration (synchronous).
294
- *
295
- * # Safety
296
- *
297
- * - `file_path` must be a valid null-terminated C string
298
- * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
299
- * - The returned pointer must be freed with `kreuzberg_free_result`
300
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
301
- *
302
- * # Example (C)
303
- *
304
- * ```c
305
- * const char* path = "/path/to/document.pdf";
306
- * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
307
- * CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
308
- * if (result != NULL && result->success) {
309
- * printf("Content: %s\n", result->content);
310
- * kreuzberg_free_result(result);
311
- * }
312
- * ```
313
- */
314
- struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
315
- const char *config_json);
316
-
317
- /**
318
- * Extract text and metadata from byte array (synchronous).
319
- *
320
- * # Safety
321
- *
322
- * - `data` must be a valid pointer to a byte array of length `data_len`
323
- * - `mime_type` must be a valid null-terminated C string
324
- * - The returned pointer must be freed with `kreuzberg_free_result`
325
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
326
- *
327
- * # Example (C)
328
- *
329
- * ```c
330
- * const uint8_t* data = ...; // Document bytes
331
- * size_t len = ...; // Length of data
332
- * const char* mime = "application/pdf";
333
- * CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
334
- * if (result != NULL && result->success) {
335
- * printf("Content: %s\n", result->content);
336
- * kreuzberg_free_result(result);
337
- * } else {
338
- * const char* error = kreuzberg_last_error();
339
- * printf("Error: %s\n", error);
340
- * }
341
- * ```
342
- */
343
- struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
344
- uintptr_t data_len,
345
- const char *mime_type);
346
-
347
- /**
348
- * Extract text and metadata from byte array with custom configuration (synchronous).
349
- *
350
- * # Safety
351
- *
352
- * - `data` must be a valid pointer to a byte array of length `data_len`
353
- * - `mime_type` must be a valid null-terminated C string
354
- * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
355
- * - The returned pointer must be freed with `kreuzberg_free_result`
356
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
357
- *
358
- * # Example (C)
359
- *
360
- * ```c
361
- * const uint8_t* data = ...; // Document bytes
362
- * size_t len = ...; // Length of data
363
- * const char* mime = "application/pdf";
364
- * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
365
- * CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
366
- * if (result != NULL && result->success) {
367
- * printf("Content: %s\n", result->content);
368
- * kreuzberg_free_result(result);
369
- * }
370
- * ```
371
- */
372
- struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
373
- uintptr_t data_len,
374
- const char *mime_type,
375
- const char *config_json);
376
-
377
- /**
378
- * Batch extract text and metadata from multiple files (synchronous).
379
- *
380
- * # Safety
381
- *
382
- * - `file_paths` must be a valid pointer to an array of null-terminated C strings
383
- * - `count` must be the number of file paths in the array
384
- * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
385
- * - The returned pointer must be freed with `kreuzberg_free_batch_result`
386
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
387
- */
388
- struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
389
- uintptr_t count,
390
- const char *config_json);
391
-
392
- /**
393
- * Batch extract text and metadata from multiple byte arrays (synchronous).
394
- *
395
- * # Safety
396
- *
397
- * - `items` must be a valid pointer to an array of CBytesWithMime structures
398
- * - `count` must be the number of items in the array
399
- * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
400
- * - The returned pointer must be freed with `kreuzberg_free_batch_result`
401
- * - Returns NULL on error (check `kreuzberg_last_error` for details)
402
- */
403
- struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
404
- uintptr_t count,
405
- const char *config_json);
406
-
407
- /**
408
- * Load an extraction configuration from a TOML/YAML/JSON file.
409
- *
410
- * # Safety
411
- *
412
- * - `file_path` must be a valid null-terminated C string
413
- * - The returned string must be freed with `kreuzberg_free_string`
414
- * - Returns NULL on error (check `kreuzberg_last_error`)
415
- */
416
- char *kreuzberg_load_extraction_config_from_file(const char *file_path);
417
-
418
- /**
419
- * Free a batch result returned by batch extraction functions.
420
- *
421
- * # Safety
422
- *
423
- * - `batch_result` must be a pointer previously returned by a batch extraction function
424
- * - `batch_result` can be NULL (no-op)
425
- * - `batch_result` must not be used after this call
426
- * - All results and strings within the batch result will be freed automatically
427
- */
428
- void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
429
-
430
- /**
431
- * Free a string returned by Kreuzberg functions.
432
- *
433
- * # Safety
434
- *
435
- * - `s` must be a string previously returned by a Kreuzberg function
436
- * - `s` can be NULL (no-op)
437
- * - `s` must not be used after this call
438
- *
439
- * # Example (C)
440
- *
441
- * ```c
442
- * char* str = result->content;
443
- * kreuzberg_free_string(str);
444
- * // str is now invalid
445
- * ```
446
- */
447
- void kreuzberg_free_string(char *s);
448
-
449
- /**
450
- * Clone a null-terminated string using Rust's allocator.
451
- *
452
- * # Safety
453
- *
454
- * - `s` must be a valid null-terminated UTF-8 string
455
- * - Returned pointer must be freed with `kreuzberg_free_string`
456
- * - Returns NULL on error (check `kreuzberg_last_error`)
457
- */
458
- char *kreuzberg_clone_string(const char *s);
459
-
460
- /**
461
- * Free an extraction result returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`).
462
- *
463
- * # Safety
464
- *
465
- * - `result` must be a pointer previously returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`)
466
- * - `result` can be NULL (no-op)
467
- * - `result` must not be used after this call
468
- * - All string fields within the result will be freed automatically
469
- *
470
- * # Example (C)
471
- *
472
- * ```c
473
- * CExtractionResult* result = kreuzberg_extract_file_sync(path);
474
- * // Use result...
475
- * kreuzberg_free_result(result);
476
- * // result is now invalid
477
- * ```
478
- */
479
- void kreuzberg_free_result(struct CExtractionResult *result);
480
-
481
- /**
482
- * Get the last error message from a failed operation.
483
- *
484
- * # Safety
485
- *
486
- * - Returns a static string that does not need to be freed
487
- * - Returns NULL if no error has occurred
488
- * - The returned string is valid until the next Kreuzberg function call on the same thread
489
- *
490
- * # Example (C)
491
- *
492
- * ```c
493
- * CExtractionResult* result = kreuzberg_extract_file_sync(path);
494
- * if (result == NULL) {
495
- * const char* error = kreuzberg_last_error();
496
- * if (error != NULL) {
497
- * printf("Error: %s\n", error);
498
- * }
499
- * }
500
- * ```
501
- */
502
- const char *kreuzberg_last_error(void);
503
-
504
- /**
505
- * Get the error code for the last error.
506
- *
507
- * Returns the error code as an `int32_t`. Error codes are defined in the ErrorCode enum:
508
- * - 0: Success (no error)
509
- * - 1: GenericError
510
- * - 2: Panic
511
- * - 3: InvalidArgument
512
- * - 4: IoError
513
- * - 5: ParsingError
514
- * - 6: OcrError
515
- * - 7: MissingDependency
516
- *
517
- * # Safety
518
- *
519
- * This function is thread-safe and always safe to call.
520
- *
521
- * # Example (C)
522
- *
523
- * ```c
524
- * CExtractionResult* result = kreuzberg_extract_file_sync(path);
525
- * if (result == NULL) {
526
- * int32_t code = kreuzberg_last_error_code();
527
- * if (code == 2) {
528
- * // A panic occurred
529
- * }
530
- * }
531
- * ```
532
- */
533
- int32_t kreuzberg_last_error_code(void);
534
-
535
- /**
536
- * Get the panic context for the last error (if it was a panic).
537
- *
538
- * Returns a JSON string containing panic context information, or NULL if
539
- * the last error was not a panic.
540
- *
541
- * The JSON structure contains:
542
- * - file: Source file where panic occurred
543
- * - line: Line number
544
- * - function: Function name
545
- * - message: Panic message
546
- * - timestamp_secs: Unix timestamp (seconds since epoch)
547
- *
548
- * # Safety
549
- *
550
- * The returned string must be freed with kreuzberg_free_string().
551
- *
552
- * # Example (C)
553
- *
554
- * ```c
555
- * CExtractionResult* result = kreuzberg_extract_file_sync(path);
556
- * if (result == NULL && kreuzberg_last_error_code() == 2) {
557
- * char* context = kreuzberg_last_panic_context();
558
- * if (context != NULL) {
559
- * printf("Panic context: %s\n", context);
560
- * kreuzberg_free_string(context);
561
- * }
562
- * }
563
- * ```
564
- */
565
- char *kreuzberg_last_panic_context(void);
566
-
567
- /**
568
- * Get the library version string.
569
- *
570
- * # Safety
571
- *
572
- * - Returns a static string that does not need to be freed
573
- * - The returned string is always valid
574
- *
575
- * # Example (C)
576
- *
577
- * ```c
578
- * const char* version = kreuzberg_version();
579
- * printf("Kreuzberg version: %s\n", version);
580
- * ```
581
- */
582
- const char *kreuzberg_version(void);
583
-
584
- /**
585
- * Register a custom OCR backend via FFI callback.
586
- *
587
- * # Safety
588
- *
589
- * - `name` must be a valid null-terminated C string
590
- * - `callback` must be a valid function pointer that:
591
- * - Does not store the image_bytes pointer
592
- * - Returns a null-terminated UTF-8 string or NULL on error
593
- * - The returned string must be freeable by kreuzberg_free_string
594
- * - Returns true on success, false on error (check kreuzberg_last_error)
595
- *
596
- * # Example (C)
597
- *
598
- * ```c
599
- * char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
600
- * // Implement OCR logic here
601
- * // Return allocated string with result, or NULL on error
602
- * return strdup("Extracted text");
603
- * }
604
- *
605
- * bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
606
- * if (!success) {
607
- * const char* error = kreuzberg_last_error();
608
- * printf("Failed to register: %s\n", error);
609
- * }
610
- * ```
611
- */
612
- bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
613
-
614
- /**
615
- * Register a custom OCR backend with explicit language support via FFI callback.
616
- *
617
- * # Safety
618
- *
619
- * - `languages_json` must be a null-terminated JSON array of language codes or NULL
620
- * - See `kreuzberg_register_ocr_backend` for additional safety notes.
621
- */
622
- bool kreuzberg_register_ocr_backend_with_languages(const char *name,
623
- OcrBackendCallback callback,
624
- const char *languages_json);
625
-
626
- /**
627
- * Register a custom PostProcessor via FFI callback.
628
- *
629
- * # Safety
630
- *
631
- * - `name` must be a valid null-terminated C string
632
- * - `callback` must be a valid function pointer that:
633
- * - Does not store the result_json pointer
634
- * - Returns a null-terminated UTF-8 JSON string or NULL on error
635
- * - The returned string must be freeable by kreuzberg_free_string
636
- * - `priority` determines the order of execution (higher priority runs first)
637
- * - Returns true on success, false on error (check kreuzberg_last_error)
638
- *
639
- * # Example (C)
640
- *
641
- * ```c
642
- * char* my_post_processor(const char* result_json) {
643
- * // Parse result_json, modify it, return JSON string
644
- * return strdup("{\"content\":\"PROCESSED\"}");
645
- * }
646
- *
647
- * bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
648
- * if (!success) {
649
- * const char* error = kreuzberg_last_error();
650
- * printf("Failed to register: %s\n", error);
651
- * }
652
- * ```
653
- */
654
- bool kreuzberg_register_post_processor(const char *name,
655
- PostProcessorCallback callback,
656
- int32_t priority);
657
-
658
- /**
659
- * Register a custom PostProcessor with an explicit processing stage.
660
- *
661
- * # Safety
662
- *
663
- * - `name` must be a valid null-terminated C string
664
- * - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
665
- * - `callback` must be a valid function pointer that:
666
- * - Does not store the result_json pointer
667
- * - Returns a null-terminated UTF-8 JSON string or NULL on error
668
- * - The returned string must be freeable by kreuzberg_free_string
669
- * - `priority` determines the order of execution within the stage (higher priority runs first)
670
- * - Returns true on success, false on error (check kreuzberg_last_error)
671
- */
672
- bool kreuzberg_register_post_processor_with_stage(const char *name,
673
- PostProcessorCallback callback,
674
- int32_t priority,
675
- const char *stage);
676
-
677
- /**
678
- * Unregister a PostProcessor by name.
679
- *
680
- * # Safety
681
- *
682
- * - `name` must be a valid null-terminated C string
683
- * - Returns true on success, false on error (check kreuzberg_last_error)
684
- *
685
- * # Example (C)
686
- *
687
- * ```c
688
- * bool success = kreuzberg_unregister_post_processor("my-processor");
689
- * if (!success) {
690
- * const char* error = kreuzberg_last_error();
691
- * printf("Failed to unregister: %s\n", error);
692
- * }
693
- * ```
694
- */
695
- bool kreuzberg_unregister_post_processor(const char *name);
696
-
697
- /**
698
- * Clear all registered PostProcessors.
699
- *
700
- * # Safety
701
- *
702
- * - Removes all registered processors. Subsequent extractions will run without them.
703
- * - Returns true on success, false on error.
704
- */
705
- bool kreuzberg_clear_post_processors(void);
706
-
707
- /**
708
- * List all registered PostProcessors as a JSON array of names.
709
- *
710
- * # Safety
711
- *
712
- * - Returned string must be freed with `kreuzberg_free_string`.
713
- * - Returns NULL on error (check `kreuzberg_last_error`).
714
- */
715
- char *kreuzberg_list_post_processors(void);
716
-
717
- /**
718
- * Register a custom DocumentExtractor via FFI callback.
719
- *
720
- * # Safety
721
- *
722
- * - `name` must be a valid null-terminated C string
723
- * - `callback` must be a valid function pointer that:
724
- * - Does not store the content, mime_type, or config_json pointers
725
- * - Returns a null-terminated UTF-8 JSON string or NULL on error
726
- * - The returned string must be freeable by kreuzberg_free_string
727
- * - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
728
- * - `priority` determines the order of selection (higher priority preferred)
729
- * - Returns true on success, false on error (check kreuzberg_last_error)
730
- *
731
- * # Example (C)
732
- *
733
- * ```c
734
- * char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
735
- * // Extract content from bytes, return JSON ExtractionResult
736
- * return strdup("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
737
- * }
738
- *
739
- * bool success = kreuzberg_register_document_extractor(
740
- * "my-extractor",
741
- * my_extractor,
742
- * "application/x-custom,text/x-custom",
743
- * 100
744
- * );
745
- * if (!success) {
746
- * const char* error = kreuzberg_last_error();
747
- * printf("Failed to register: %s\n", error);
748
- * }
749
- * ```
750
- */
751
- bool kreuzberg_register_document_extractor(const char *name,
752
- DocumentExtractorCallback callback,
753
- const char *mime_types,
754
- int32_t priority);
755
-
756
- /**
757
- * Unregister a DocumentExtractor by name.
758
- *
759
- * # Safety
760
- *
761
- * - `name` must be a valid null-terminated C string
762
- * - Returns true on success, false on error (check kreuzberg_last_error)
763
- *
764
- * # Example (C)
765
- *
766
- * ```c
767
- * bool success = kreuzberg_unregister_document_extractor("my-extractor");
768
- * if (!success) {
769
- * const char* error = kreuzberg_last_error();
770
- * printf("Failed to unregister: %s\n", error);
771
- * }
772
- * ```
773
- */
774
- bool kreuzberg_unregister_document_extractor(const char *name);
775
-
776
- /**
777
- * List all registered DocumentExtractors as a JSON array of names.
778
- *
779
- * # Safety
780
- *
781
- * - Returned string must be freed with `kreuzberg_free_string`.
782
- * - Returns NULL on error (check `kreuzberg_last_error`).
783
- */
784
- char *kreuzberg_list_document_extractors(void);
785
-
786
- /**
787
- * Register a custom Validator via FFI callback.
788
- *
789
- * # Safety
790
- *
791
- * - `name` must be a valid null-terminated C string
792
- * - `callback` must be a valid function pointer that:
793
- * - Does not store the result_json pointer
794
- * - Returns a null-terminated UTF-8 string (error message) if validation fails
795
- * - Returns NULL if validation passes
796
- * - The returned string must be freeable by kreuzberg_free_string
797
- * - `priority` determines the order of validation (higher priority runs first)
798
- * - Returns true on success, false on error (check kreuzberg_last_error)
799
- *
800
- * # Example (C)
801
- *
802
- * ```c
803
- * char* my_validator(const char* result_json) {
804
- * // Parse result_json, validate it
805
- * // Return error message if validation fails, NULL if passes
806
- * if (invalid) {
807
- * return strdup("Validation failed: content too short");
808
- * }
809
- * return NULL;
810
- * }
811
- *
812
- * bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
813
- * if (!success) {
814
- * const char* error = kreuzberg_last_error();
815
- * printf("Failed to register: %s\n", error);
816
- * }
817
- * ```
818
- */
819
- bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
820
-
821
- /**
822
- * Unregister a Validator by name.
823
- *
824
- * # Safety
825
- *
826
- * - `name` must be a valid null-terminated C string
827
- * - Returns true on success, false on error (check kreuzberg_last_error)
828
- *
829
- * # Example (C)
830
- *
831
- * ```c
832
- * bool success = kreuzberg_unregister_validator("my-validator");
833
- * if (!success) {
834
- * const char* error = kreuzberg_last_error();
835
- * printf("Failed to unregister: %s\n", error);
836
- * }
837
- * ```
838
- */
839
- bool kreuzberg_unregister_validator(const char *name);
840
-
841
- /**
842
- * Clear all registered Validators.
843
- *
844
- * # Safety
845
- *
846
- * - Removes all validators. Subsequent extractions will skip custom validation.
847
- * - Returns true on success, false on error.
848
- */
849
- bool kreuzberg_clear_validators(void);
850
-
851
- /**
852
- * List all registered Validators as a JSON array of names.
853
- *
854
- * # Safety
855
- *
856
- * - Returned string must be freed with `kreuzberg_free_string`.
857
- * - Returns NULL on error (check `kreuzberg_last_error`).
858
- */
859
- char *kreuzberg_list_validators(void);
860
-
861
- /**
862
- * Unregister an OCR backend by name.
863
- *
864
- * # Safety
865
- *
866
- * - `name` must be a valid null-terminated C string
867
- * - Returns true on success, false on error (check kreuzberg_last_error)
868
- *
869
- * # Example (C)
870
- *
871
- * ```c
872
- * bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
873
- * if (!success) {
874
- * const char* error = kreuzberg_last_error();
875
- * printf("Failed to unregister: %s\n", error);
876
- * }
877
- * ```
878
- */
879
- bool kreuzberg_unregister_ocr_backend(const char *name);
880
-
881
- /**
882
- * List all registered OCR backends as a JSON array of names.
883
- *
884
- * # Safety
885
- *
886
- * - Returned string must be freed with `kreuzberg_free_string`.
887
- * - Returns NULL on error (check `kreuzberg_last_error`).
888
- *
889
- * # Example (C)
890
- *
891
- * ```c
892
- * char* backends = kreuzberg_list_ocr_backends();
893
- * if (backends == NULL) {
894
- * const char* error = kreuzberg_last_error();
895
- * printf("Failed to list backends: %s\n", error);
896
- * } else {
897
- * printf("OCR backends: %s\n", backends);
898
- * kreuzberg_free_string(backends);
899
- * }
900
- * ```
901
- */
902
- char *kreuzberg_list_ocr_backends(void);
903
-
904
- /**
905
- * Clear all registered OCR backends.
906
- *
907
- * # Safety
908
- *
909
- * - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
910
- * - Returns true on success, false on error.
911
- *
912
- * # Example (C)
913
- *
914
- * ```c
915
- * bool success = kreuzberg_clear_ocr_backends();
916
- * if (!success) {
917
- * const char* error = kreuzberg_last_error();
918
- * printf("Failed to clear OCR backends: %s\n", error);
919
- * }
920
- * ```
921
- */
922
- bool kreuzberg_clear_ocr_backends(void);
923
-
924
- /**
925
- * Clear all registered DocumentExtractors.
926
- *
927
- * # Safety
928
- *
929
- * - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
930
- * - Returns true on success, false on error.
931
- *
932
- * # Example (C)
933
- *
934
- * ```c
935
- * bool success = kreuzberg_clear_document_extractors();
936
- * if (!success) {
937
- * const char* error = kreuzberg_last_error();
938
- * printf("Failed to clear document extractors: %s\n", error);
939
- * }
940
- * ```
941
- */
942
- bool kreuzberg_clear_document_extractors(void);
943
-
944
- /**
945
- * Detect MIME type from raw bytes.
946
- *
947
- * # Safety
948
- *
949
- * - `bytes` must be a valid pointer to byte data
950
- * - `len` must be the correct length of the byte array
951
- * - The returned string must be freed with `kreuzberg_free_string`
952
- * - Returns NULL on error (check `kreuzberg_last_error`)
953
- *
954
- * # Example (C)
955
- *
956
- * ```c
957
- * const char* pdf_bytes = "%PDF-1.4\n";
958
- * char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
959
- * if (mime == NULL) {
960
- * const char* error = kreuzberg_last_error();
961
- * printf("Failed to detect MIME type: %s\n", error);
962
- * } else {
963
- * printf("MIME type: %s\n", mime);
964
- * kreuzberg_free_string(mime);
965
- * }
966
- * ```
967
- */
968
- char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
969
-
970
- /**
971
- * Detect MIME type from file path (checks extension and reads file content).
972
- *
973
- * # Safety
974
- *
975
- * - `file_path` must be a valid null-terminated C string
976
- * - The returned string must be freed with `kreuzberg_free_string`
977
- * - Returns NULL on error (check `kreuzberg_last_error`)
978
- *
979
- * # Example (C)
980
- *
981
- * ```c
982
- * char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
983
- * if (mime == NULL) {
984
- * const char* error = kreuzberg_last_error();
985
- * printf("Failed to detect MIME type: %s\n", error);
986
- * } else {
987
- * printf("MIME type: %s\n", mime);
988
- * kreuzberg_free_string(mime);
989
- * }
990
- * ```
991
- */
992
- char *kreuzberg_detect_mime_type_from_path(const char *file_path);
993
-
994
- /**
995
- * Get file extensions for a MIME type.
996
- *
997
- * # Safety
998
- *
999
- * - `mime_type` must be a valid null-terminated C string
1000
- * - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
1001
- * - Returns NULL on error (check `kreuzberg_last_error`)
1002
- *
1003
- * # Example (C)
1004
- *
1005
- * ```c
1006
- * char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
1007
- * if (extensions == NULL) {
1008
- * const char* error = kreuzberg_last_error();
1009
- * printf("Failed to get extensions: %s\n", error);
1010
- * } else {
1011
- * printf("Extensions: %s\n", extensions);
1012
- * kreuzberg_free_string(extensions);
1013
- * }
1014
- * ```
1015
- */
1016
- char *kreuzberg_get_extensions_for_mime(const char *mime_type);
1017
-
1018
- /**
1019
- * Load an ExtractionConfig from a file.
1020
- *
1021
- * Automatically detects the file format based on extension:
1022
- * - `.toml` - TOML format
1023
- * - `.yaml`, `.yml` - YAML format
1024
- * - `.json` - JSON format
1025
- *
1026
- * # Safety
1027
- *
1028
- * - `path` must be a valid null-terminated C string representing a file path
1029
- * - Returns a pointer to ExtractionConfig on success, NULL on error
1030
- * - The returned config must be freed with `kreuzberg_free_config`
1031
- * - Check `kreuzberg_last_error` on NULL return
1032
- *
1033
- * # Example (C)
1034
- *
1035
- * ```c
1036
- * ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
1037
- * if (config == NULL) {
1038
- * const char* error = kreuzberg_last_error();
1039
- * printf("Failed to load config: %s\n", error);
1040
- * return 1;
1041
- * }
1042
- *
1043
- * // Use config...
1044
- * char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
1045
- *
1046
- * kreuzberg_free_config(config);
1047
- * ```
1048
- */
1049
- ExtractionConfig *kreuzberg_config_from_file(const char *path);
1050
-
1051
- /**
1052
- * Discover and load an ExtractionConfig by searching parent directories.
1053
- *
1054
- * Searches the current directory and all parent directories for:
1055
- * - `kreuzberg.toml`
1056
- * - `kreuzberg.yaml`
1057
- * - `kreuzberg.yml`
1058
- * - `kreuzberg.json`
1059
- *
1060
- * Returns the first config file found as JSON, or NULL if none found.
1061
- *
1062
- * # Safety
1063
- *
1064
- * - The returned string must be freed with `kreuzberg_free_string`
1065
- * - Returns NULL if no config found or on error (check `kreuzberg_last_error`)
1066
- *
1067
- * # Example (C)
1068
- *
1069
- * ```c
1070
- * char* config_json = kreuzberg_config_discover();
1071
- * if (config_json == NULL) {
1072
- * const char* error = kreuzberg_last_error();
1073
- * if (error != NULL && strlen(error) > 0) {
1074
- * printf("Error discovering config: %s\n", error);
1075
- * return 1;
1076
- * }
1077
- * // No config found, use defaults
1078
- * printf("No config file found\n");
1079
- * } else {
1080
- * printf("Config: %s\n", config_json);
1081
- * kreuzberg_free_string(config_json);
1082
- * }
1083
- * ```
1084
- */
1085
- char *kreuzberg_config_discover(void);
1086
-
1087
- #endif /* KREUZBERG_FFI_H */
1
+ /* Auto-generated C bindings for Kreuzberg */
2
+
3
+ #ifndef KREUZBERG_FFI_H
4
+ #define KREUZBERG_FFI_H
5
+
6
+ #pragma once
7
+
8
+ /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
9
+
10
+ #include <stdarg.h>
11
+ #include <stdbool.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ /**
15
+ * Opaque type for extraction configuration.
16
+ * This is an opaque pointer type - callers should not access its internals.
17
+ */
18
+ typedef struct ExtractionConfig ExtractionConfig;
19
+
20
+
21
+ /**
22
+ * C-compatible extraction result structure
23
+ *
24
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
25
+ * Field order: 11 pointers (8 bytes each) + 1 bool + 7 bytes padding = 96 bytes total
26
+ */
27
+ typedef struct CExtractionResult {
28
+ /**
29
+ * Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
30
+ */
31
+ char *content;
32
+ /**
33
+ * Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
34
+ */
35
+ char *mime_type;
36
+ /**
37
+ * Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
38
+ */
39
+ char *language;
40
+ /**
41
+ * Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
42
+ */
43
+ char *date;
44
+ /**
45
+ * Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
46
+ */
47
+ char *subject;
48
+ /**
49
+ * Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
50
+ */
51
+ char *tables_json;
52
+ /**
53
+ * Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
54
+ */
55
+ char *detected_languages_json;
56
+ /**
57
+ * Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
58
+ */
59
+ char *metadata_json;
60
+ /**
61
+ * Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
62
+ */
63
+ char *chunks_json;
64
+ /**
65
+ * Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
66
+ */
67
+ char *images_json;
68
+ /**
69
+ * Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
70
+ */
71
+ char *page_structure_json;
72
+ /**
73
+ * Whether extraction was successful
74
+ */
75
+ bool success;
76
+ /**
77
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
78
+ */
79
+ uint8_t _padding1[7];
80
+ } CExtractionResult;
81
+
82
+ /**
83
+ * C-compatible structure for batch extraction results
84
+ *
85
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
86
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
87
+ */
88
+ typedef struct CBatchResult {
89
+ /**
90
+ * Array of extraction results
91
+ */
92
+ struct CExtractionResult **results;
93
+ /**
94
+ * Number of results
95
+ */
96
+ uintptr_t count;
97
+ /**
98
+ * Whether batch operation was successful
99
+ */
100
+ bool success;
101
+ /**
102
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
103
+ */
104
+ uint8_t _padding2[7];
105
+ } CBatchResult;
106
+
107
+ /**
108
+ * C-compatible structure for passing byte array with MIME type in batch operations
109
+ *
110
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
111
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
112
+ */
113
+ typedef struct CBytesWithMime {
114
+ /**
115
+ * Pointer to byte data
116
+ */
117
+ const uint8_t *data;
118
+ /**
119
+ * Length of byte data
120
+ */
121
+ uintptr_t data_len;
122
+ /**
123
+ * MIME type as null-terminated C string
124
+ */
125
+ const char *mime_type;
126
+ } CBytesWithMime;
127
+
128
+ /**
129
+ * Type alias for the OCR backend callback function.
130
+ *
131
+ * # Parameters
132
+ *
133
+ * - `image_bytes`: Pointer to image data
134
+ * - `image_length`: Length of image data in bytes
135
+ * - `config_json`: JSON-encoded OcrConfig (null-terminated string)
136
+ *
137
+ * # Returns
138
+ *
139
+ * Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
140
+ * or NULL on error.
141
+ *
142
+ * # Safety
143
+ *
144
+ * The callback must:
145
+ * - Not store the image_bytes pointer (it's only valid for the duration of the call)
146
+ * - Return a valid null-terminated UTF-8 string that kreuzberg_free_string can free (e.g. one obtained from kreuzberg_clone_string)
147
+ * - Return NULL on error (error message should be retrievable separately)
148
+ */
149
+ typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
150
+ uintptr_t image_length,
151
+ const char *config_json);
152
+
153
+ /**
154
+ * Type alias for the PostProcessor callback function.
155
+ *
156
+ * # Parameters
157
+ *
158
+ * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
159
+ *
160
+ * # Returns
161
+ *
162
+ * Null-terminated JSON string containing the processed ExtractionResult
163
+ * (must be freed by Rust via kreuzberg_free_string), or NULL on error.
164
+ *
165
+ * # Safety
166
+ *
167
+ * The callback must:
168
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
169
+ * - Return a valid null-terminated UTF-8 JSON string that kreuzberg_free_string can free (e.g. one obtained from kreuzberg_clone_string)
170
+ * - Return NULL on error (error message should be retrievable separately)
171
+ */
172
+ typedef char *(*PostProcessorCallback)(const char *result_json);
173
+
174
+ /**
175
+ * Type alias for the DocumentExtractor callback function.
176
+ *
177
+ * # Parameters
178
+ *
179
+ * - `content`: Raw document bytes
180
+ * - `content_len`: Length of the content array
181
+ * - `mime_type`: MIME type of the document (null-terminated string)
182
+ * - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
183
+ *
184
+ * # Returns
185
+ *
186
+ * Null-terminated JSON string containing the ExtractionResult, or NULL on error.
187
+ * The returned string must be freeable by kreuzberg_free_string.
188
+ *
189
+ * # Safety
190
+ *
191
+ * The callback must:
192
+ * - Not store the content, mime_type, or config_json pointers (only valid during the call)
193
+ * - Return a valid null-terminated UTF-8 JSON string or NULL on error
194
+ * - The returned string must be freeable by kreuzberg_free_string
195
+ */
196
+ typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
197
+ uintptr_t content_len,
198
+ const char *mime_type,
199
+ const char *config_json);
200
+
201
+ /**
202
+ * Type alias for the Validator callback function.
203
+ *
204
+ * # Parameters
205
+ *
206
+ * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
207
+ *
208
+ * # Returns
209
+ *
210
+ * Null-terminated error message string if validation fails (must be freed by Rust
211
+ * via kreuzberg_free_string), or NULL if validation passes.
212
+ *
213
+ * # Safety
214
+ *
215
+ * The callback must:
216
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
217
+ * - Return a valid null-terminated UTF-8 string (error message) if validation fails
218
+ * - Return NULL if validation passes
219
+ * - The returned string must be freeable by kreuzberg_free_string
220
+ */
221
+ typedef char *(*ValidatorCallback)(const char *result_json);
222
+
223
+ /**
224
+ * Extract text and metadata from a file (synchronous).
225
+ *
226
+ * # Safety
227
+ *
228
+ * - `file_path` must be a valid null-terminated C string
229
+ * - The returned pointer must be freed with `kreuzberg_free_result`
230
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
231
+ *
232
+ * # Example (C)
233
+ *
234
+ * ```c
235
+ * const char* path = "/path/to/document.pdf";
236
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
237
+ * if (result != NULL && result->success) {
238
+ * printf("Content: %s\n", result->content);
239
+ * printf("MIME: %s\n", result->mime_type);
240
+ * kreuzberg_free_result(result);
241
+ * } else {
242
+ * const char* error = kreuzberg_last_error();
243
+ * printf("Error: %s\n", error);
244
+ * }
245
+ * ```
246
+ */
247
+ struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
248
+
249
+ /**
250
+ * Detect MIME type from a file path.
251
+ *
252
+ * # Safety
253
+ *
254
+ * - `file_path` must be a valid null-terminated C string
255
+ * - The returned string must be freed with `kreuzberg_free_string`
256
+ * - Returns NULL on error (check `kreuzberg_last_error`)
257
+ */
258
+ char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
259
+
260
+ /**
261
+ * Validate that a MIME type is supported by Kreuzberg.
262
+ *
263
+ * # Safety
264
+ *
265
+ * - `mime_type` must be a valid null-terminated C string
266
+ * - The returned string must be freed with `kreuzberg_free_string`
267
+ * - Returns NULL on error (check `kreuzberg_last_error`)
268
+ */
269
+ char *kreuzberg_validate_mime_type(const char *mime_type);
270
+
271
+ /**
272
+ * List available embedding preset names.
273
+ *
274
+ * # Safety
275
+ *
276
+ * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
277
+ * - Returns NULL on error (check `kreuzberg_last_error`)
278
+ */
279
+ char *kreuzberg_list_embedding_presets(void);
280
+
281
+ /**
282
+ * Get a specific embedding preset by name.
283
+ *
284
+ * # Safety
285
+ *
286
+ * - `name` must be a valid null-terminated C string
287
+ * - Returned string is JSON object and must be freed with `kreuzberg_free_string`
288
+ * - Returns NULL on error (check `kreuzberg_last_error`)
289
+ */
290
+ char *kreuzberg_get_embedding_preset(const char *name);
291
+
292
+ /**
293
+ * Extract text and metadata from a file with custom configuration (synchronous).
294
+ *
295
+ * # Safety
296
+ *
297
+ * - `file_path` must be a valid null-terminated C string
298
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
299
+ * - The returned pointer must be freed with `kreuzberg_free_result`
300
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
301
+ *
302
+ * # Example (C)
303
+ *
304
+ * ```c
305
+ * const char* path = "/path/to/document.pdf";
306
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
307
+ * CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
308
+ * if (result != NULL && result->success) {
309
+ * printf("Content: %s\n", result->content);
310
+ * kreuzberg_free_result(result);
311
+ * }
312
+ * ```
313
+ */
314
+ struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
315
+ const char *config_json);
316
+
317
+ /**
318
+ * Extract text and metadata from byte array (synchronous).
319
+ *
320
+ * # Safety
321
+ *
322
+ * - `data` must be a valid pointer to a byte array of length `data_len`
323
+ * - `mime_type` must be a valid null-terminated C string
324
+ * - The returned pointer must be freed with `kreuzberg_free_result`
325
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
326
+ *
327
+ * # Example (C)
328
+ *
329
+ * ```c
330
+ * const uint8_t* data = ...; // Document bytes
331
+ * size_t len = ...; // Length of data
332
+ * const char* mime = "application/pdf";
333
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
334
+ * if (result != NULL && result->success) {
335
+ * printf("Content: %s\n", result->content);
336
+ * kreuzberg_free_result(result);
337
+ * } else {
338
+ * const char* error = kreuzberg_last_error();
339
+ * printf("Error: %s\n", error);
340
+ * }
341
+ * ```
342
+ */
343
+ struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
344
+ uintptr_t data_len,
345
+ const char *mime_type);
346
+
347
+ /**
348
+ * Extract text and metadata from byte array with custom configuration (synchronous).
349
+ *
350
+ * # Safety
351
+ *
352
+ * - `data` must be a valid pointer to a byte array of length `data_len`
353
+ * - `mime_type` must be a valid null-terminated C string
354
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
355
+ * - The returned pointer must be freed with `kreuzberg_free_result`
356
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
357
+ *
358
+ * # Example (C)
359
+ *
360
+ * ```c
361
+ * const uint8_t* data = ...; // Document bytes
362
+ * size_t len = ...; // Length of data
363
+ * const char* mime = "application/pdf";
364
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
365
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
366
+ * if (result != NULL && result->success) {
367
+ * printf("Content: %s\n", result->content);
368
+ * kreuzberg_free_result(result);
369
+ * }
370
+ * ```
371
+ */
372
+ struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
373
+ uintptr_t data_len,
374
+ const char *mime_type,
375
+ const char *config_json);
376
+
377
+ /**
378
+ * Batch extract text and metadata from multiple files (synchronous).
379
+ *
380
+ * # Safety
381
+ *
382
+ * - `file_paths` must be a valid pointer to an array of null-terminated C strings
383
+ * - `count` must be the number of file paths in the array
384
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
385
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
386
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
387
+ */
388
+ struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
389
+ uintptr_t count,
390
+ const char *config_json);
391
+
392
+ /**
393
+ * Batch extract text and metadata from multiple byte arrays (synchronous).
394
+ *
395
+ * # Safety
396
+ *
397
+ * - `items` must be a valid pointer to an array of CBytesWithMime structures
398
+ * - `count` must be the number of items in the array
399
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
400
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
401
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
402
+ */
403
+ struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
404
+ uintptr_t count,
405
+ const char *config_json);
406
+
407
+ /**
408
+ * Load an extraction configuration from a TOML/YAML/JSON file.
409
+ *
410
+ * # Safety
411
+ *
412
+ * - `file_path` must be a valid null-terminated C string
413
+ * - The returned string must be freed with `kreuzberg_free_string`
414
+ * - Returns NULL on error (check `kreuzberg_last_error`)
415
+ */
416
+ char *kreuzberg_load_extraction_config_from_file(const char *file_path);
417
+
418
+ /**
419
+ * Free a batch result returned by batch extraction functions.
420
+ *
421
+ * # Safety
422
+ *
423
+ * - `batch_result` must be a pointer previously returned by a batch extraction function
424
+ * - `batch_result` can be NULL (no-op)
425
+ * - `batch_result` must not be used after this call
426
+ * - All results and strings within the batch result will be freed automatically
427
+ */
428
+ void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
429
+
430
+ /**
431
+ * Free a string returned by Kreuzberg functions.
432
+ *
433
+ * # Safety
434
+ *
435
+ * - `s` must be a string previously returned by a Kreuzberg function
436
+ * - `s` can be NULL (no-op)
437
+ * - `s` must not be used after this call
438
+ *
439
+ * # Example (C)
440
+ *
441
+ * ```c
442
+ * char* str = kreuzberg_detect_mime_type_from_path("document.pdf");
443
+ * kreuzberg_free_string(str);
444
+ * // str is now invalid
445
+ * ```
446
+ */
447
+ void kreuzberg_free_string(char *s);
448
+
449
+ /**
450
+ * Clone a null-terminated string using Rust's allocator.
451
+ *
452
+ * # Safety
453
+ *
454
+ * - `s` must be a valid null-terminated UTF-8 string
455
+ * - Returned pointer must be freed with `kreuzberg_free_string`
456
+ * - Returns NULL on error (check `kreuzberg_last_error`)
457
+ */
458
+ char *kreuzberg_clone_string(const char *s);
459
+
460
+ /**
461
+ * Free an extraction result returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`).
462
+ *
463
+ * # Safety
464
+ *
465
+ * - `result` must be a pointer previously returned by a function documented as requiring `kreuzberg_free_result` (e.g. `kreuzberg_extract_file_sync`)
466
+ * - `result` can be NULL (no-op)
467
+ * - `result` must not be used after this call
468
+ * - All string fields within the result will be freed automatically
469
+ *
470
+ * # Example (C)
471
+ *
472
+ * ```c
473
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
474
+ * // Use result...
475
+ * kreuzberg_free_result(result);
476
+ * // result is now invalid
477
+ * ```
478
+ */
479
+ void kreuzberg_free_result(struct CExtractionResult *result);
480
+
481
+ /**
482
+ * Get the last error message from a failed operation.
483
+ *
484
+ * # Safety
485
+ *
486
+ * - Returns a library-owned string; the caller does not need to (and should not) free it
487
+ * - Returns NULL if no error has occurred
488
+ * - The returned string is valid until the next Kreuzberg function call on the same thread
489
+ *
490
+ * # Example (C)
491
+ *
492
+ * ```c
493
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
494
+ * if (result == NULL) {
495
+ * const char* error = kreuzberg_last_error();
496
+ * if (error != NULL) {
497
+ * printf("Error: %s\n", error);
498
+ * }
499
+ * }
500
+ * ```
501
+ */
502
+ const char *kreuzberg_last_error(void);
503
+
504
+ /**
505
+ * Get the error code for the last error.
506
+ *
507
+ * Returns the error code as an int32_t. Error codes are defined in the ErrorCode enum:
508
+ * - 0: Success (no error)
509
+ * - 1: GenericError
510
+ * - 2: Panic
511
+ * - 3: InvalidArgument
512
+ * - 4: IoError
513
+ * - 5: ParsingError
514
+ * - 6: OcrError
515
+ * - 7: MissingDependency
516
+ *
517
+ * # Safety
518
+ *
519
+ * This function is thread-safe and always safe to call.
520
+ *
521
+ * # Example (C)
522
+ *
523
+ * ```c
524
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
525
+ * if (result == NULL) {
526
+ * int32_t code = kreuzberg_last_error_code();
527
+ * if (code == 2) {
528
+ * // A panic occurred
529
+ * }
530
+ * }
531
+ * ```
532
+ */
533
+ int32_t kreuzberg_last_error_code(void);
534
+
535
+ /**
536
+ * Get the panic context for the last error (if it was a panic).
537
+ *
538
+ * Returns a JSON string containing panic context information, or NULL if
539
+ * the last error was not a panic.
540
+ *
541
+ * The JSON structure contains:
542
+ * - file: Source file where panic occurred
543
+ * - line: Line number
544
+ * - function: Function name
545
+ * - message: Panic message
546
+ * - timestamp_secs: Unix timestamp (seconds since epoch)
547
+ *
548
+ * # Safety
549
+ *
550
+ * The returned string must be freed with kreuzberg_free_string().
551
+ *
552
+ * # Example (C)
553
+ *
554
+ * ```c
555
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
556
+ * if (result == NULL && kreuzberg_last_error_code() == 2) {
557
+ * char* context = kreuzberg_last_panic_context();
558
+ * if (context != NULL) {
559
+ * printf("Panic context: %s\n", context);
560
+ * kreuzberg_free_string(context);
561
+ * }
562
+ * }
563
+ * ```
564
+ */
565
+ char *kreuzberg_last_panic_context(void);
566
+
567
+ /**
568
+ * Get the library version string.
569
+ *
570
+ * # Safety
571
+ *
572
+ * - Returns a static string that does not need to be freed
573
+ * - The returned string is always valid
574
+ *
575
+ * # Example (C)
576
+ *
577
+ * ```c
578
+ * const char* version = kreuzberg_version();
579
+ * printf("Kreuzberg version: %s\n", version);
580
+ * ```
581
+ */
582
+ const char *kreuzberg_version(void);
583
+
584
+ /**
585
+ * Register a custom OCR backend via FFI callback.
586
+ *
587
+ * # Safety
588
+ *
589
+ * - `name` must be a valid null-terminated C string
590
+ * - `callback` must be a valid function pointer that:
591
+ * - Does not store the image_bytes pointer
592
+ * - Returns a null-terminated UTF-8 string or NULL on error
593
+ * - The returned string must be freeable by kreuzberg_free_string
594
+ * - Returns true on success, false on error (check kreuzberg_last_error)
595
+ *
596
+ * # Example (C)
597
+ *
598
+ * ```c
599
+ * char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
600
+ * // Implement OCR logic here
601
+ * // Return allocated string with result, or NULL on error
602
+ * return kreuzberg_clone_string("Extracted text");
603
+ * }
604
+ *
605
+ * bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
606
+ * if (!success) {
607
+ * const char* error = kreuzberg_last_error();
608
+ * printf("Failed to register: %s\n", error);
609
+ * }
610
+ * ```
611
+ */
612
+ bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
613
+
614
+ /**
615
+ * Register a custom OCR backend with explicit language support via FFI callback.
616
+ *
617
+ * # Safety
618
+ *
619
+ * - `languages_json` must be a null-terminated JSON array of language codes or NULL
620
+ * - See `kreuzberg_register_ocr_backend` for additional safety notes.
621
+ */
622
+ bool kreuzberg_register_ocr_backend_with_languages(const char *name,
623
+ OcrBackendCallback callback,
624
+ const char *languages_json);
625
+
626
+ /**
627
+ * Register a custom PostProcessor via FFI callback.
628
+ *
629
+ * # Safety
630
+ *
631
+ * - `name` must be a valid null-terminated C string
632
+ * - `callback` must be a valid function pointer that:
633
+ * - Does not store the result_json pointer
634
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
635
+ * - The returned string must be freeable by kreuzberg_free_string
636
+ * - `priority` determines the order of execution (higher priority runs first)
637
+ * - Returns true on success, false on error (check kreuzberg_last_error)
638
+ *
639
+ * # Example (C)
640
+ *
641
+ * ```c
642
+ * char* my_post_processor(const char* result_json) {
643
+ * // Parse result_json, modify it, return JSON string
644
+ * return kreuzberg_clone_string("{\"content\":\"PROCESSED\"}");
645
+ * }
646
+ *
647
+ * bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
648
+ * if (!success) {
649
+ * const char* error = kreuzberg_last_error();
650
+ * printf("Failed to register: %s\n", error);
651
+ * }
652
+ * ```
653
+ */
654
+ bool kreuzberg_register_post_processor(const char *name,
655
+ PostProcessorCallback callback,
656
+ int32_t priority);
657
+
658
+ /**
659
+ * Register a custom PostProcessor with an explicit processing stage.
660
+ *
661
+ * # Safety
662
+ *
663
+ * - `name` must be a valid null-terminated C string
664
+ * - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
665
+ * - `callback` must be a valid function pointer that:
666
+ * - Does not store the result_json pointer
667
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
668
+ * - The returned string must be freeable by kreuzberg_free_string
669
+ * - `priority` determines the order of execution within the stage (higher priority runs first)
670
+ * - Returns true on success, false on error (check kreuzberg_last_error)
671
+ */
672
+ bool kreuzberg_register_post_processor_with_stage(const char *name,
673
+ PostProcessorCallback callback,
674
+ int32_t priority,
675
+ const char *stage);
676
+
677
+ /**
678
+ * Unregister a PostProcessor by name.
679
+ *
680
+ * # Safety
681
+ *
682
+ * - `name` must be a valid null-terminated C string
683
+ * - Returns true on success, false on error (check kreuzberg_last_error)
684
+ *
685
+ * # Example (C)
686
+ *
687
+ * ```c
688
+ * bool success = kreuzberg_unregister_post_processor("my-processor");
689
+ * if (!success) {
690
+ * const char* error = kreuzberg_last_error();
691
+ * printf("Failed to unregister: %s\n", error);
692
+ * }
693
+ * ```
694
+ */
695
+ bool kreuzberg_unregister_post_processor(const char *name);
696
+
697
+ /**
698
+ * Clear all registered PostProcessors.
699
+ *
700
+ * # Safety
701
+ *
702
+ * - Removes all registered processors. Subsequent extractions will run without them.
703
+ * - Returns true on success, false on error.
704
+ */
705
+ bool kreuzberg_clear_post_processors(void);
706
+
707
+ /**
708
+ * List all registered PostProcessors as a JSON array of names.
709
+ *
710
+ * # Safety
711
+ *
712
+ * - Returned string must be freed with `kreuzberg_free_string`.
713
+ * - Returns NULL on error (check `kreuzberg_last_error`).
714
+ */
715
+ char *kreuzberg_list_post_processors(void);
716
+
717
+ /**
718
+ * Register a custom DocumentExtractor via FFI callback.
719
+ *
720
+ * # Safety
721
+ *
722
+ * - `name` must be a valid null-terminated C string
723
+ * - `callback` must be a valid function pointer that:
724
+ * - Does not store the content, mime_type, or config_json pointers
725
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
726
+ * - The returned string must be freeable by kreuzberg_free_string
727
+ * - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
728
+ * - `priority` determines the order of selection (higher priority preferred)
729
+ * - Returns true on success, false on error (check kreuzberg_last_error)
730
+ *
731
+ * # Example (C)
732
+ *
733
+ * ```c
734
+ * char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
735
+ * // Extract content from bytes, return JSON ExtractionResult
736
+ * return kreuzberg_clone_string("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
737
+ * }
738
+ *
739
+ * bool success = kreuzberg_register_document_extractor(
740
+ * "my-extractor",
741
+ * my_extractor,
742
+ * "application/x-custom,text/x-custom",
743
+ * 100
744
+ * );
745
+ * if (!success) {
746
+ * const char* error = kreuzberg_last_error();
747
+ * printf("Failed to register: %s\n", error);
748
+ * }
749
+ * ```
750
+ */
751
+ bool kreuzberg_register_document_extractor(const char *name,
752
+ DocumentExtractorCallback callback,
753
+ const char *mime_types,
754
+ int32_t priority);
755
+
756
+ /**
757
+ * Unregister a DocumentExtractor by name.
758
+ *
759
+ * # Safety
760
+ *
761
+ * - `name` must be a valid null-terminated C string
762
+ * - Returns true on success, false on error (check kreuzberg_last_error)
763
+ *
764
+ * # Example (C)
765
+ *
766
+ * ```c
767
+ * bool success = kreuzberg_unregister_document_extractor("my-extractor");
768
+ * if (!success) {
769
+ * const char* error = kreuzberg_last_error();
770
+ * printf("Failed to unregister: %s\n", error);
771
+ * }
772
+ * ```
773
+ */
774
+ bool kreuzberg_unregister_document_extractor(const char *name);
775
+
776
+ /**
777
+ * List all registered DocumentExtractors as a JSON array of names.
778
+ *
779
+ * # Safety
780
+ *
781
+ * - Returned string must be freed with `kreuzberg_free_string`.
782
+ * - Returns NULL on error (check `kreuzberg_last_error`).
783
+ */
784
+ char *kreuzberg_list_document_extractors(void);
785
+
786
+ /**
787
+ * Register a custom Validator via FFI callback.
788
+ *
789
+ * # Safety
790
+ *
791
+ * - `name` must be a valid null-terminated C string
792
+ * - `callback` must be a valid function pointer that:
793
+ * - Does not store the result_json pointer
794
+ * - Returns a null-terminated UTF-8 string (error message) if validation fails
795
+ * - Returns NULL if validation passes
796
+ * - The returned string must be freeable by kreuzberg_free_string
797
+ * - `priority` determines the order of validation (higher priority runs first)
798
+ * - Returns true on success, false on error (check kreuzberg_last_error)
799
+ *
800
+ * # Example (C)
801
+ *
802
+ * ```c
803
+ * char* my_validator(const char* result_json) {
804
+ * // Parse result_json, validate it
805
+ * // Return error message if validation fails, NULL if passes
806
+ * if (invalid) {
807
+ * return kreuzberg_clone_string("Validation failed: content too short");
808
+ * }
809
+ * return NULL;
810
+ * }
811
+ *
812
+ * bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
813
+ * if (!success) {
814
+ * const char* error = kreuzberg_last_error();
815
+ * printf("Failed to register: %s\n", error);
816
+ * }
817
+ * ```
818
+ */
819
+ bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
820
+
821
+ /**
822
+ * Unregister a Validator by name.
823
+ *
824
+ * # Safety
825
+ *
826
+ * - `name` must be a valid null-terminated C string
827
+ * - Returns true on success, false on error (check kreuzberg_last_error)
828
+ *
829
+ * # Example (C)
830
+ *
831
+ * ```c
832
+ * bool success = kreuzberg_unregister_validator("my-validator");
833
+ * if (!success) {
834
+ * const char* error = kreuzberg_last_error();
835
+ * printf("Failed to unregister: %s\n", error);
836
+ * }
837
+ * ```
838
+ */
839
+ bool kreuzberg_unregister_validator(const char *name);
840
+
841
+ /**
842
+ * Clear all registered Validators.
843
+ *
844
+ * # Safety
845
+ *
846
+ * - Removes all validators. Subsequent extractions will skip custom validation.
847
+ * - Returns true on success, false on error.
848
+ */
849
+ bool kreuzberg_clear_validators(void);
850
+
851
+ /**
852
+ * List all registered Validators as a JSON array of names.
853
+ *
854
+ * # Safety
855
+ *
856
+ * - Returned string must be freed with `kreuzberg_free_string`.
857
+ * - Returns NULL on error (check `kreuzberg_last_error`).
858
+ */
859
+ char *kreuzberg_list_validators(void);
860
+
861
+ /**
862
+ * Unregister an OCR backend by name.
863
+ *
864
+ * # Safety
865
+ *
866
+ * - `name` must be a valid null-terminated C string
867
+ * - Returns true on success, false on error (check kreuzberg_last_error)
868
+ *
869
+ * # Example (C)
870
+ *
871
+ * ```c
872
+ * bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
873
+ * if (!success) {
874
+ * const char* error = kreuzberg_last_error();
875
+ * printf("Failed to unregister: %s\n", error);
876
+ * }
877
+ * ```
878
+ */
879
+ bool kreuzberg_unregister_ocr_backend(const char *name);
880
+
881
+ /**
882
+ * List all registered OCR backends as a JSON array of names.
883
+ *
884
+ * # Safety
885
+ *
886
+ * - Returned string must be freed with `kreuzberg_free_string`.
887
+ * - Returns NULL on error (check `kreuzberg_last_error`).
888
+ *
889
+ * # Example (C)
890
+ *
891
+ * ```c
892
+ * char* backends = kreuzberg_list_ocr_backends();
893
+ * if (backends == NULL) {
894
+ * const char* error = kreuzberg_last_error();
895
+ * printf("Failed to list backends: %s\n", error);
896
+ * } else {
897
+ * printf("OCR backends: %s\n", backends);
898
+ * kreuzberg_free_string(backends);
899
+ * }
900
+ * ```
901
+ */
902
+ char *kreuzberg_list_ocr_backends(void);
903
+
904
+ /**
905
+ * Clear all registered OCR backends.
906
+ *
907
+ * # Safety
908
+ *
909
+ * - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
910
+ * - Returns true on success, false on error.
911
+ *
912
+ * # Example (C)
913
+ *
914
+ * ```c
915
+ * bool success = kreuzberg_clear_ocr_backends();
916
+ * if (!success) {
917
+ * const char* error = kreuzberg_last_error();
918
+ * printf("Failed to clear OCR backends: %s\n", error);
919
+ * }
920
+ * ```
921
+ */
922
+ bool kreuzberg_clear_ocr_backends(void);
923
+
924
+ /**
925
+ * Clear all registered DocumentExtractors.
926
+ *
927
+ * # Safety
928
+ *
929
+ * - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
930
+ * - Returns true on success, false on error.
931
+ *
932
+ * # Example (C)
933
+ *
934
+ * ```c
935
+ * bool success = kreuzberg_clear_document_extractors();
936
+ * if (!success) {
937
+ * const char* error = kreuzberg_last_error();
938
+ * printf("Failed to clear document extractors: %s\n", error);
939
+ * }
940
+ * ```
941
+ */
942
+ bool kreuzberg_clear_document_extractors(void);
943
+
944
+ /**
945
+ * Detect MIME type from raw bytes.
946
+ *
947
+ * # Safety
948
+ *
949
+ * - `bytes` must be a valid pointer to byte data
950
+ * - `len` must be the correct length of the byte array
951
+ * - The returned string must be freed with `kreuzberg_free_string`
952
+ * - Returns NULL on error (check `kreuzberg_last_error`)
953
+ *
954
+ * # Example (C)
955
+ *
956
+ * ```c
957
+ * const char* pdf_bytes = "%PDF-1.4\n";
958
+ * char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
959
+ * if (mime == NULL) {
960
+ * const char* error = kreuzberg_last_error();
961
+ * printf("Failed to detect MIME type: %s\n", error);
962
+ * } else {
963
+ * printf("MIME type: %s\n", mime);
964
+ * kreuzberg_free_string(mime);
965
+ * }
966
+ * ```
967
+ */
968
+ char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
969
+
970
+ /**
971
+ * Detect MIME type from file path (checks extension and reads file content).
972
+ *
973
+ * # Safety
974
+ *
975
+ * - `file_path` must be a valid null-terminated C string
976
+ * - The returned string must be freed with `kreuzberg_free_string`
977
+ * - Returns NULL on error (check `kreuzberg_last_error`)
978
+ *
979
+ * # Example (C)
980
+ *
981
+ * ```c
982
+ * char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
983
+ * if (mime == NULL) {
984
+ * const char* error = kreuzberg_last_error();
985
+ * printf("Failed to detect MIME type: %s\n", error);
986
+ * } else {
987
+ * printf("MIME type: %s\n", mime);
988
+ * kreuzberg_free_string(mime);
989
+ * }
990
+ * ```
991
+ */
992
+ char *kreuzberg_detect_mime_type_from_path(const char *file_path);
993
+
994
+ /**
995
+ * Get file extensions for a MIME type.
996
+ *
997
+ * # Safety
998
+ *
999
+ * - `mime_type` must be a valid null-terminated C string
1000
+ * - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
1001
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1002
+ *
1003
+ * # Example (C)
1004
+ *
1005
+ * ```c
1006
+ * char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
1007
+ * if (extensions == NULL) {
1008
+ * const char* error = kreuzberg_last_error();
1009
+ * printf("Failed to get extensions: %s\n", error);
1010
+ * } else {
1011
+ * printf("Extensions: %s\n", extensions);
1012
+ * kreuzberg_free_string(extensions);
1013
+ * }
1014
+ * ```
1015
+ */
1016
+ char *kreuzberg_get_extensions_for_mime(const char *mime_type);
1017
+
1018
+ /**
1019
+ * Load an ExtractionConfig from a file.
1020
+ *
1021
+ * Automatically detects the file format based on extension:
1022
+ * - `.toml` - TOML format
1023
+ * - `.yaml`, `.yml` - YAML format
1024
+ * - `.json` - JSON format
1025
+ *
1026
+ * # Safety
1027
+ *
1028
+ * - `path` must be a valid null-terminated C string representing a file path
1029
+ * - Returns a pointer to ExtractionConfig on success, NULL on error
1030
+ * - The returned config must be freed with `kreuzberg_free_config`
1031
+ * - Check `kreuzberg_last_error` on NULL return
1032
+ *
1033
+ * # Example (C)
1034
+ *
1035
+ * ```c
1036
+ * ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
1037
+ * if (config == NULL) {
1038
+ * const char* error = kreuzberg_last_error();
1039
+ * printf("Failed to load config: %s\n", error);
1040
+ * return 1;
1041
+ * }
1042
+ *
1043
+ * // Use config...
1044
+ * char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
1045
+ *
1046
+ * kreuzberg_free_config(config);
1047
+ * ```
1048
+ */
1049
+ ExtractionConfig *kreuzberg_config_from_file(const char *path);
1050
+
1051
+ /**
1052
+ * Discover and load an ExtractionConfig by searching parent directories.
1053
+ *
1054
+ * Searches the current directory and all parent directories for:
1055
+ * - `kreuzberg.toml`
1056
+ * - `kreuzberg.yaml`
1057
+ * - `kreuzberg.yml`
1058
+ * - `kreuzberg.json`
1059
+ *
1060
+ * Returns the first config file found as JSON, or NULL if none found.
1061
+ *
1062
+ * # Safety
1063
+ *
1064
+ * - The returned string must be freed with `kreuzberg_free_string`
1065
+ * - Returns NULL if no config found or on error (check `kreuzberg_last_error`)
1066
+ *
1067
+ * # Example (C)
1068
+ *
1069
+ * ```c
1070
+ * char* config_json = kreuzberg_config_discover();
1071
+ * if (config_json == NULL) {
1072
+ * const char* error = kreuzberg_last_error();
1073
+ * if (error != NULL && strlen(error) > 0) {
1074
+ * printf("Error discovering config: %s\n", error);
1075
+ * return 1;
1076
+ * }
1077
+ * // No config found, use defaults
1078
+ * printf("No config file found\n");
1079
+ * } else {
1080
+ * printf("Config: %s\n", config_json);
1081
+ * kreuzberg_free_string(config_json);
1082
+ * }
1083
+ * ```
1084
+ */
1085
+ char *kreuzberg_config_discover(void);
1086
+
1087
+ #endif /* KREUZBERG_FFI_H */