kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,3012 @@
1
+ /* Auto-generated C bindings for Kreuzberg */
2
+
3
+ #ifndef KREUZBERG_FFI_H
4
+ #define KREUZBERG_FFI_H
5
+
6
+ #pragma once
7
+
8
+ /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
9
+
10
+ #include <stdarg.h>
11
+ #include <stdbool.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ /**
15
+ * Opaque type for extraction configuration.
16
+ * This is an opaque pointer type - callers should not access its internals.
17
+ */
18
+ typedef struct ExtractionConfig ExtractionConfig;
19
+
20
+ /**
21
+ * Opaque type for extraction result.
22
+ * This is an opaque pointer type - callers should not access its internals.
23
+ * Use the kreuzberg_result_* accessor functions to extract data.
24
+ */
25
+ typedef struct ExtractionResult ExtractionResult;
26
+
27
+
28
+ typedef struct Option_ErrorCallback Option_ErrorCallback;
29
+
30
+ /**
31
+ * Memory pool for ExtractionResult objects.
32
+ *
33
+ * Pre-allocates storage and reuses memory across multiple extractions.
34
+ * Thread-safe with internal synchronization.
35
+ *
36
+ * # Memory Model
37
+ *
38
+ * - Results are owned by the pool until reset or freed
39
+ * - Pool grows automatically if capacity is exceeded
40
+ * - Reset clears all results but retains capacity
41
+ * - Free releases all memory and destroys pool
42
+ *
43
+ * # Thread Safety
44
+ *
45
+ * Pool uses internal Mutex for synchronization. Safe for concurrent access
46
+ * but may serialize extractions. For parallel processing, consider using
47
+ * separate pools per thread.
48
+ */
49
+ typedef struct ResultPool ResultPool;
50
+
51
+ /**
52
+ * Zero-copy view into an ExtractionResult.
53
+ *
54
+ * Provides direct pointers to string data without allocation or copying.
55
+ * All pointers are valid UTF-8 byte slices (not null-terminated).
56
+ *
57
+ * # Lifetime Safety
58
+ *
59
+ * This structure contains borrowed pointers. The caller MUST ensure:
60
+ * - The source `ExtractionResult` outlives this view
61
+ * - No use after the source result is freed with `kreuzberg_result_free()`
62
+ *
63
+ * # Memory Layout
64
+ *
65
+ * Field order: 6 ptr+len pairs (96 bytes) + 5 counts (40 bytes) = 136 bytes on 64-bit systems
66
+ * All pointers are either valid UTF-8 data or NULL (with corresponding len=0).
67
+ *
68
+ * # Thread Safety
69
+ *
70
+ * Views are NOT thread-safe. External synchronization required for concurrent access.
71
+ */
72
+ typedef struct CExtractionResultView {
73
+ /**
74
+ * Direct pointer to content bytes (UTF-8, not null-terminated)
75
+ */
76
+ const uint8_t *content_ptr;
77
+ /**
78
+ * Length of content in bytes
79
+ */
80
+ uintptr_t content_len;
81
+ /**
82
+ * Direct pointer to MIME type bytes (UTF-8, not null-terminated)
83
+ */
84
+ const uint8_t *mime_type_ptr;
85
+ /**
86
+ * Length of MIME type in bytes
87
+ */
88
+ uintptr_t mime_type_len;
89
+ /**
90
+ * Direct pointer to language bytes (UTF-8, not null-terminated), or NULL
91
+ */
92
+ const uint8_t *language_ptr;
93
+ /**
94
+ * Length of language in bytes (0 if NULL)
95
+ */
96
+ uintptr_t language_len;
97
+ /**
98
+ * Direct pointer to date bytes (UTF-8, not null-terminated), or NULL
99
+ */
100
+ const uint8_t *date_ptr;
101
+ /**
102
+ * Length of date in bytes (0 if NULL)
103
+ */
104
+ uintptr_t date_len;
105
+ /**
106
+ * Direct pointer to subject bytes (UTF-8, not null-terminated), or NULL
107
+ */
108
+ const uint8_t *subject_ptr;
109
+ /**
110
+ * Length of subject in bytes (0 if NULL)
111
+ */
112
+ uintptr_t subject_len;
113
+ /**
114
+ * Direct pointer to title bytes (UTF-8, not null-terminated), or NULL
115
+ */
116
+ const uint8_t *title_ptr;
117
+ /**
118
+ * Length of title in bytes (0 if NULL)
119
+ */
120
+ uintptr_t title_len;
121
+ /**
122
+ * Number of tables extracted
123
+ */
124
+ uintptr_t table_count;
125
+ /**
126
+ * Number of chunks (0 if chunking not enabled)
127
+ */
128
+ uintptr_t chunk_count;
129
+ /**
130
+ * Number of detected languages (0 if language detection not enabled)
131
+ */
132
+ uintptr_t detected_language_count;
133
+ /**
134
+ * Number of extracted images (0 if no images)
135
+ */
136
+ uintptr_t image_count;
137
+ /**
138
+ * Total page count (0 if not applicable)
139
+ */
140
+ uintptr_t page_count;
141
+ } CExtractionResultView;
142
+
143
+ /**
144
+ * Callback function invoked for each successfully extracted result.
145
+ *
146
+ * # Arguments
147
+ *
148
+ * * `result` - Borrowed pointer to extraction result (valid only during callback)
149
+ * * `file_index` - Zero-based index of the file in the batch
150
+ * * `user_data` - User-provided context pointer
151
+ *
152
+ * # Returns
153
+ *
154
+ * - `0` to continue processing remaining files
155
+ * - Non-zero to cancel batch processing (no further callbacks)
156
+ *
157
+ * # Safety
158
+ *
159
+ * - `result` pointer is valid only during the callback execution
160
+ * - `result` is automatically freed after callback returns
161
+ * - Caller must copy/serialize data if needed beyond callback scope
162
+ * - `user_data` is passed through opaquely (caller manages lifetime)
163
+ */
164
+ typedef int (*ResultCallback)(const struct CExtractionResultView *result,
165
+ uintptr_t file_index,
166
+ void *user_data);
167
+
168
+ /**
169
+ * C-compatible structured error details returned by `kreuzberg_get_error_details()`.
170
+ *
171
+ * All string fields (message, error_type, source_file, source_function, context_info)
172
+ * are dynamically allocated C strings that MUST be freed using `kreuzberg_free_string()`.
173
+ * Set fields are non-NULL; unset fields are NULL.
174
+ */
175
+ typedef struct CErrorDetails {
176
+ /**
177
+ * The error message (must be freed with kreuzberg_free_string)
178
+ */
179
+ char *message;
180
+ /**
181
+ * Numeric error code (0-7 for Kreuzberg errors, 1-7 for panic_shield codes)
182
+ */
183
+ uint32_t error_code;
184
+ /**
185
+ * Human-readable error type name (must be freed with kreuzberg_free_string)
186
+ */
187
+ char *error_type;
188
+ /**
189
+ * Source file where error occurred (may be NULL)
190
+ */
191
+ char *source_file;
192
+ /**
193
+ * Source function where error occurred (may be NULL)
194
+ */
195
+ char *source_function;
196
+ /**
197
+ * Line number in source file (0 if unknown)
198
+ */
199
+ uint32_t source_line;
200
+ /**
201
+ * Additional context information (may be NULL)
202
+ */
203
+ char *context_info;
204
+ /**
205
+ * 1 if this error originated from a panic, 0 otherwise
206
+ */
207
+ int32_t is_panic;
208
+ } CErrorDetails;
209
+
210
+ /**
211
+ * C-compatible extraction result structure
212
+ *
213
+ * This struct must maintain a stable ABI and memory layout for FFI compatibility.
214
+ *
215
+ * # Memory Layout
216
+ *
217
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
218
+ * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
219
+ *
220
+ * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
221
+ * - Fields are laid out in order
222
+ * - Padding is added to maintain alignment
223
+ * - The struct has the same size and alignment on all 64-bit platforms
224
+ *
225
+ * # Memory Management
226
+ *
227
+ * All pointer fields are owned by the caller and must be freed using `kreuzberg_free_string`.
228
+ * The struct itself must be freed using `kreuzberg_free_extraction_result`.
229
+ */
230
+ typedef struct CExtractionResult {
231
+ /**
232
+ * Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
233
+ */
234
+ char *content;
235
+ /**
236
+ * Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
237
+ */
238
+ char *mime_type;
239
+ /**
240
+ * Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
241
+ */
242
+ char *language;
243
+ /**
244
+ * Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
245
+ */
246
+ char *date;
247
+ /**
248
+ * Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
249
+ */
250
+ char *subject;
251
+ /**
252
+ * Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
253
+ */
254
+ char *tables_json;
255
+ /**
256
+ * Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
257
+ */
258
+ char *detected_languages_json;
259
+ /**
260
+ * Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
261
+ */
262
+ char *metadata_json;
263
+ /**
264
+ * Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
265
+ */
266
+ char *chunks_json;
267
+ /**
268
+ * Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
269
+ */
270
+ char *images_json;
271
+ /**
272
+ * Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
273
+ */
274
+ char *page_structure_json;
275
+ /**
276
+ * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
277
+ */
278
+ char *pages_json;
279
+ /**
280
+ * Whether extraction was successful
281
+ */
282
+ bool success;
283
+ /**
284
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
285
+ */
286
+ uint8_t _padding1[7];
287
+ } CExtractionResult;
288
+
289
+ /**
290
+ * C-compatible structure for batch extraction results
291
+ *
292
+ * # Memory Layout
293
+ *
294
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
295
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
296
+ *
297
+ * The padding ensures the struct is properly aligned for 64-bit architectures.
298
+ *
299
+ * # Memory Management
300
+ *
301
+ * - The `results` array must be freed using `kreuzberg_free_batch_result`
302
+ * - Each individual result in the array must also be freed
303
+ */
304
+ typedef struct CBatchResult {
305
+ /**
306
+ * Array of extraction results
307
+ */
308
+ struct CExtractionResult **results;
309
+ /**
310
+ * Number of results
311
+ */
312
+ uintptr_t count;
313
+ /**
314
+ * Whether batch operation was successful
315
+ */
316
+ bool success;
317
+ /**
318
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
319
+ */
320
+ uint8_t _padding2[7];
321
+ } CBatchResult;
322
+
323
+ /**
324
+ * C-compatible structure for passing byte array with MIME type in batch operations
325
+ *
326
+ * # Memory Layout
327
+ *
328
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
329
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
330
+ *
331
+ * The `#[repr(C)]` attribute ensures consistent memory layout across languages.
332
+ *
333
+ * # Usage
334
+ *
335
+ * This struct is used to pass document data to batch extraction functions. The caller
336
+ * retains ownership of the data and mime_type pointers.
337
+ */
338
+ typedef struct CBytesWithMime {
339
+ /**
340
+ * Pointer to byte data
341
+ */
342
+ const uint8_t *data;
343
+ /**
344
+ * Length of byte data
345
+ */
346
+ uintptr_t data_len;
347
+ /**
348
+ * MIME type as null-terminated C string
349
+ */
350
+ const char *mime_type;
351
+ } CBytesWithMime;
352
+
353
+ /**
354
+ * Type alias for the DocumentExtractor callback function.
355
+ *
356
+ * # Parameters
357
+ *
358
+ * - `content`: Pointer to document bytes (valid only during the call)
359
+ * - `content_len`: Length of the content in bytes
360
+ * - `mime_type`: Null-terminated MIME type string
361
+ * - `config_json`: Null-terminated JSON configuration string
362
+ *
363
+ * # Returns
364
+ *
365
+ * Null-terminated JSON string containing the ExtractionResult
366
+ * (must be freed by Rust via kreuzberg_free_string), or NULL on error.
367
+ *
368
+ * # Safety
369
+ *
370
+ * The callback must:
371
+ * - Not store the content, mime_type, or config_json pointers (only valid during the call)
372
+ * - Return a valid null-terminated UTF-8 JSON string or NULL on error
373
+ * - The returned string must be freeable by kreuzberg_free_string
374
+ */
375
+ typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
376
+ uintptr_t content_len,
377
+ const char *mime_type,
378
+ const char *config_json);
379
+
380
+ /**
381
+ * Type alias for the OCR backend callback function.
382
+ *
383
+ * # Parameters
384
+ *
385
+ * - `image_bytes`: Raw image bytes
386
+ * - `image_length`: Length of image data in bytes
387
+ * - `config_json`: JSON-encoded OcrConfig (null-terminated string)
388
+ *
389
+ * # Returns
390
+ *
391
+ * Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
392
+ * or NULL on error.
393
+ *
394
+ * # Safety
395
+ *
396
+ * The callback must:
397
+ * - Not store the image_bytes pointer (it's only valid for the duration of the call)
398
+ * - Return a valid null-terminated UTF-8 string allocated by the callback (Rust frees it via kreuzberg_free_string)
399
+ * - Return NULL on error (error message should be retrievable separately)
400
+ */
401
+ typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
402
+ uintptr_t image_length,
403
+ const char *config_json);
404
+
405
+ /**
406
+ * Type alias for the PostProcessor callback function.
407
+ *
408
+ * # Parameters
409
+ *
410
+ * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
411
+ *
412
+ * # Returns
413
+ *
414
+ * Null-terminated JSON string containing the processed ExtractionResult
415
+ * (must be freed by Rust via kreuzberg_free_string), or NULL on error.
416
+ *
417
+ * # Safety
418
+ *
419
+ * The callback must:
420
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
421
+ * - Return a valid null-terminated UTF-8 JSON string allocated by the callback (Rust frees it via kreuzberg_free_string)
422
+ * - Return NULL on error (error message should be retrievable separately)
423
+ */
424
+ typedef char *(*PostProcessorCallback)(const char *result_json);
425
+
426
+ /**
427
+ * Validator callback function type for FFI.
428
+ *
429
+ * This is a C function pointer that validates extraction results.
430
+ *
431
+ * # Safety
432
+ *
433
+ * The callback must:
434
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
435
+ * - Return a valid null-terminated UTF-8 string (error message) if validation fails
436
+ * - Return NULL if validation passes
437
+ * - The returned string must be freeable by kreuzberg_free_string
438
+ */
439
+ typedef char *(*ValidatorCallback)(const char *result_json);
440
+
441
+ /**
442
+ * Metadata field accessor structure
443
+ *
444
+ * Returned by `kreuzberg_result_get_metadata_field()`. Contains the field value
445
+ * as JSON and information about whether the field exists.
446
+ *
447
+ * # Fields
448
+ *
449
+ * * `name` - The field name requested (does not need to be freed)
450
+ * * `json_value` - JSON representation of the field value, or NULL if field doesn't exist
451
+ * * `is_null` - 1 if the field doesn't exist, 0 if it does
452
+ *
453
+ * The `json_value` pointer (if non-NULL) must be freed with `kreuzberg_free_string()`.
454
+ */
455
+ typedef struct CMetadataField {
456
+ const char *name;
457
+ char *json_value;
458
+ int32_t is_null;
459
+ } CMetadataField;
460
+
461
+ /**
462
+ * Statistics for result pool allocation tracking.
463
+ *
464
+ * Provides insight into pool efficiency and memory usage patterns.
465
+ */
466
+ typedef struct CResultPoolStats {
467
+ /**
468
+ * Current number of results stored in pool
469
+ */
470
+ uintptr_t current_count;
471
+ /**
472
+ * Maximum capacity of pool (before automatic growth)
473
+ */
474
+ uintptr_t capacity;
475
+ /**
476
+ * Total number of allocations (successful extractions)
477
+ */
478
+ uintptr_t total_allocations;
479
+ /**
480
+ * Number of times pool capacity was exceeded (triggered growth)
481
+ */
482
+ uintptr_t growth_events;
483
+ /**
484
+ * Estimated memory used by results in bytes
485
+ */
486
+ uintptr_t estimated_memory_bytes;
487
+ } CResultPoolStats;
488
+
489
+ /**
490
+ * Statistics for string interning efficiency tracking.
491
+ */
492
+ typedef struct CStringInternStats {
493
+ /**
494
+ * Number of unique strings currently interned
495
+ */
496
+ uintptr_t unique_count;
497
+ /**
498
+ * Total number of intern requests
499
+ */
500
+ uintptr_t total_requests;
501
+ /**
502
+ * Number of cache hits (string already interned)
503
+ */
504
+ uintptr_t cache_hits;
505
+ /**
506
+ * Number of cache misses (new string added)
507
+ */
508
+ uintptr_t cache_misses;
509
+ /**
510
+ * Estimated memory saved by deduplication (bytes)
511
+ */
512
+ uintptr_t estimated_memory_saved;
513
+ /**
514
+ * Total memory used by interned strings (bytes)
515
+ */
516
+ uintptr_t total_memory_bytes;
517
+ } CStringInternStats;
518
+
519
+ /**
520
+ * Extract multiple files in streaming mode with callback-based result delivery.
521
+ *
522
+ * Processes files one at a time without accumulating results in memory.
523
+ * Each result is passed to the callback and then freed automatically.
524
+ *
525
+ * # Arguments
526
+ *
527
+ * * `files` - Array of null-terminated file path strings
528
+ * * `count` - Number of files in the array
529
+ * * `config_json` - Optional JSON configuration string (NULL for defaults)
530
+ * * `result_callback` - Callback invoked for each successful extraction
531
+ * * `user_data` - Optional user context passed to callbacks
532
+ * * `error_callback` - Optional callback invoked for extraction failures
533
+ *
534
+ * # Returns
535
+ *
536
+ * - `0` on success (all files processed or cancelled by callback)
537
+ * - `-1` on error (invalid arguments, configuration parsing failure)
538
+ *
539
+ * # Error Handling
540
+ *
541
+ * - Individual file failures invoke `error_callback` but don't stop processing
542
+ * - Callback can return non-zero to cancel remaining files
543
+ * - Invalid arguments or config parsing errors return `-1` immediately
544
+ *
545
+ * # Safety
546
+ *
547
+ * - `files` must point to valid array of `count` C string pointers
548
+ * - All file path strings must be valid null-terminated UTF-8
549
+ * - `config_json` must be valid null-terminated UTF-8 if not NULL
550
+ * - `result_callback` must be a valid function pointer
551
+ * - `error_callback` must be a valid function pointer if not NULL
552
+ * - Result pointers passed to callbacks are valid only during callback
553
+ * - Callbacks must not store result pointers for later use
554
+ *
555
+ * # Example (C)
556
+ *
557
+ * ```c
558
+ * int process_result(const CExtractionResultView* result, size_t index, void* data) {
559
+ * // Copy data needed beyond callback scope
560
+ * char content[1024];
561
+ * size_t copy_len = result->content_len < 1024 ? result->content_len : 1023;
562
+ * memcpy(content, result->content_ptr, copy_len);
563
+ * content[copy_len] = '\0';
564
+ * return 0; // Continue
565
+ * }
566
+ *
567
+ * void handle_error(size_t index, const char* msg, void* data) {
568
+ * fprintf(stderr, "File %zu failed: %s\n", index, msg);
569
+ * }
570
+ *
571
+ * const char* files[] = {"a.pdf", "b.txt", "c.docx"};
572
+ * kreuzberg_extract_batch_streaming(files, 3, NULL, process_result, NULL, handle_error);
573
+ * ```
574
+ */
575
+ int kreuzberg_extract_batch_streaming(const char *const *files,
576
+ uintptr_t count,
577
+ const char *config_json,
578
+ ResultCallback result_callback,
579
+ void *user_data,
580
+ struct Option_ErrorCallback error_callback);
581
+
582
+ /**
583
+ * Extract multiple files in parallel streaming mode.
584
+ *
585
+ * Similar to `kreuzberg_extract_batch_streaming` but processes files in parallel
586
+ * using a thread pool. Results are delivered via callback as they complete.
587
+ *
588
+ * # Arguments
589
+ *
590
+ * * `files` - Array of null-terminated file path strings
591
+ * * `count` - Number of files in the array
592
+ * * `config_json` - Optional JSON configuration string (NULL for defaults)
593
+ * * `result_callback` - Thread-safe callback invoked for each successful extraction
594
+ * * `user_data` - Optional user context passed to callbacks (must be thread-safe)
595
+ * * `error_callback` - Optional thread-safe callback invoked for failures
596
+ * * `max_parallel` - Maximum number of parallel extractions (0 = number of CPUs)
597
+ *
598
+ * # Returns
599
+ *
600
+ * - `0` on success (all files processed or cancelled)
601
+ * - `-1` on error (invalid arguments, configuration parsing failure)
602
+ *
603
+ * # Thread Safety
604
+ *
605
+ * - Both callbacks may be invoked concurrently from multiple threads
606
+ * - `user_data` must be thread-safe (e.g., synchronized with mutex)
607
+ * - Callback can set atomic flag to signal cancellation
608
+ *
609
+ * # Safety
610
+ *
611
+ * Same requirements as `kreuzberg_extract_batch_streaming`, plus:
612
+ * - Callbacks must be thread-safe
613
+ * - `user_data` must support concurrent access
614
+ *
615
+ * # Example (C)
616
+ *
617
+ * ```c
618
+ * typedef struct {
619
+ * pthread_mutex_t lock;
620
+ * atomic_int cancel_flag;
621
+ * } BatchContext;
622
+ *
623
+ * int process_result(const CExtractionResultView* result, size_t index, void* data) {
624
+ * BatchContext* ctx = (BatchContext*)data;
625
+ * pthread_mutex_lock(&ctx->lock);
626
+ * // Process result with thread safety
627
+ * pthread_mutex_unlock(&ctx->lock);
628
+ * return atomic_load(&ctx->cancel_flag);
629
+ * }
630
+ * ```
631
+ */
632
+ int kreuzberg_extract_batch_parallel(const char *const *files,
633
+ uintptr_t count,
634
+ const char *config_json,
635
+ ResultCallback result_callback,
636
+ void *user_data,
637
+ struct Option_ErrorCallback error_callback,
638
+ uintptr_t max_parallel);
639
+
640
+ /**
641
+ * Parse an ExtractionConfig from a JSON string.
642
+ *
643
+ * This is the primary FFI entry point for all language bindings to parse
644
+ * configuration from JSON. Replaces the need for each binding to implement
645
+ * its own JSON parsing logic.
646
+ *
647
+ * # Arguments
648
+ *
649
+ * * `json_config` - Null-terminated C string containing JSON configuration
650
+ *
651
+ * # Returns
652
+ *
653
+ * A pointer to an ExtractionConfig struct that MUST be freed with
654
+ * `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
655
+ *
656
+ * # Safety
657
+ *
658
+ * - `json_config` must be a valid null-terminated C string
659
+ * - The returned pointer must be freed with `kreuzberg_config_free`
660
+ * - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
661
+ *
662
+ * # Example (C)
663
+ *
664
+ * ```c
665
+ * const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
666
+ * ExtractionConfig* config = kreuzberg_config_from_json(config_json);
667
+ * if (config == NULL) {
668
+ * printf("Error: %s\n", kreuzberg_last_error());
669
+ * return 1;
670
+ * }
671
+ *
672
+ * // Use config...
673
+ * // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
674
+ *
675
+ * kreuzberg_config_free(config);
676
+ * ```
677
+ */
678
+ ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
679
+
680
+ /**
681
+ * Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
682
+ *
683
+ * # Safety
684
+ *
685
+ * - `config` must be a pointer previously returned by a config creation function
686
+ * - `config` can be NULL (no-op)
687
+ * - `config` must not be used after this call
688
+ *
689
+ * # Example (C)
690
+ *
691
+ * ```c
692
+ * ExtractionConfig* config = kreuzberg_config_from_json("{...}");
693
+ * if (config != NULL) {
694
+ * // Use config...
695
+ * kreuzberg_config_free(config);
696
+ * }
697
+ * ```
698
+ */
699
+ void kreuzberg_config_free(ExtractionConfig *config);
700
+
701
+ /**
702
+ * Validate a JSON config string without building an ExtractionConfig.
703
+ *
704
+ * This function checks if a JSON config string is valid and would parse correctly,
705
+ * without allocating the full ExtractionConfig structure. Useful for validation
706
+ * before committing to parsing.
707
+ *
708
+ * # Arguments
709
+ *
710
+ * * `json_config` - Null-terminated C string containing JSON configuration
711
+ *
712
+ * # Returns
713
+ *
714
+ * - 1 if valid (would parse successfully)
715
+ * - 0 if invalid (check `kreuzberg_last_error` for details)
716
+ *
717
+ * # Safety
718
+ *
719
+ * - `json_config` must be a valid null-terminated C string
720
+ *
721
+ * # Example (C)
722
+ *
723
+ * ```c
724
+ * const char* config_json = "{\"use_cache\": true}";
725
+ * if (kreuzberg_config_is_valid(config_json)) {
726
+ * ExtractionConfig* config = kreuzberg_config_from_json(config_json);
727
+ * // Use config...
728
+ * kreuzberg_config_free(config);
729
+ * } else {
730
+ * printf("Invalid config: %s\n", kreuzberg_last_error());
731
+ * }
732
+ * ```
733
+ */
734
+ int32_t kreuzberg_config_is_valid(const char *json_config);
735
+
736
+ /**
737
+ * Serialize an ExtractionConfig to JSON string.
738
+ *
739
+ * Converts an ExtractionConfig structure to its JSON representation, allowing
740
+ * bindings to serialize configs without reimplementing serialization logic.
741
+ *
742
+ * # Arguments
743
+ *
744
+ * * `config` - Pointer to an ExtractionConfig structure
745
+ *
746
+ * # Returns
747
+ *
748
+ * A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
749
+ * Returns NULL on error (check `kreuzberg_last_error`).
750
+ *
751
+ * # Safety
752
+ *
753
+ * - `config` must be a valid pointer to an ExtractionConfig
754
+ * - `config` cannot be NULL
755
+ * - The returned pointer must be freed with `kreuzberg_free_string`
756
+ *
757
+ * # Example (C)
758
+ *
759
+ * ```c
760
+ * ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
761
+ * if (config != NULL) {
762
+ * char* json = kreuzberg_config_to_json(config);
763
+ * if (json != NULL) {
764
+ * printf("Serialized: %s\n", json);
765
+ * kreuzberg_free_string(json);
766
+ * }
767
+ * kreuzberg_config_free(config);
768
+ * }
769
+ * ```
770
+ */
771
+ char *kreuzberg_config_to_json(const ExtractionConfig *config);
772
+
773
+ /**
774
+ * Get a specific field from config as JSON string.
775
+ *
776
+ * Retrieves a nested field from the configuration by path and returns its JSON
777
+ * representation. Supports dot notation for nested fields (e.g., "ocr.backend").
778
+ *
779
+ * # Arguments
780
+ *
781
+ * * `config` - Pointer to an ExtractionConfig structure
782
+ * * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
783
+ *
784
+ * # Returns
785
+ *
786
+ * A pointer to a C string containing the field value as JSON, or NULL if:
787
+ * - The field doesn't exist
788
+ * - An error occurs during serialization
789
+ *
790
+ * The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
791
+ *
792
+ * # Safety
793
+ *
794
+ * - `config` must be a valid pointer to an ExtractionConfig
795
+ * - `field_name` must be a valid null-terminated C string
796
+ * - Neither parameter can be NULL
797
+ *
798
+ * # Example (C)
799
+ *
800
+ * ```c
801
+ * ExtractionConfig* config = kreuzberg_config_from_json(
802
+ * "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
803
+ * );
804
+ * if (config != NULL) {
805
+ * char* use_cache = kreuzberg_config_get_field(config, "use_cache");
806
+ * char* backend = kreuzberg_config_get_field(config, "ocr.backend");
807
+ *
808
+ * if (use_cache != NULL) {
809
+ * printf("use_cache: %s\n", use_cache);
810
+ * kreuzberg_free_string(use_cache);
811
+ * }
812
+ *
813
+ * if (backend != NULL) {
814
+ * printf("backend: %s\n", backend);
815
+ * kreuzberg_free_string(backend);
816
+ * }
817
+ *
818
+ * kreuzberg_config_free(config);
819
+ * }
820
+ * ```
821
+ */
822
+ char *kreuzberg_config_get_field(const ExtractionConfig *config, const char *field_name);
823
+
824
+ /**
825
+ * Merge two configs (override takes precedence over base).
826
+ *
827
+ * Performs a shallow merge of two ExtractionConfig structures, where fields
828
+ * from `override_config` take precedence over fields in `base`. The `base`
829
+ * config is modified in-place.
830
+ *
831
+ * # Arguments
832
+ *
833
+ * * `base` - Pointer to the base ExtractionConfig (will be modified)
834
+ * * `override_config` - Pointer to the override ExtractionConfig (read-only)
835
+ *
836
+ * # Returns
837
+ *
838
+ * - 1 on success
839
+ * - 0 on error (check `kreuzberg_last_error`)
840
+ *
841
+ * # Safety
842
+ *
843
+ * - `base` must be a valid mutable pointer to an ExtractionConfig
844
+ * - `override_config` must be a valid pointer to an ExtractionConfig
845
+ * - Neither parameter can be NULL
846
+ * - `base` is modified in-place
847
+ *
848
+ * # Example (C)
849
+ *
850
+ * ```c
851
+ * ExtractionConfig* base = kreuzberg_config_from_json(
852
+ * "{\"use_cache\": true, \"force_ocr\": false}"
853
+ * );
854
+ * ExtractionConfig* override = kreuzberg_config_from_json(
855
+ * "{\"force_ocr\": true}"
856
+ * );
857
+ *
858
+ * if (kreuzberg_config_merge(base, override) == 1) {
859
+ * // base now has: use_cache=true, force_ocr=true
860
+ * char* json = kreuzberg_config_to_json(base);
861
+ * printf("Merged config: %s\n", json);
862
+ * kreuzberg_free_string(json);
863
+ * }
864
+ *
865
+ * kreuzberg_config_free(base);
866
+ * kreuzberg_config_free(override);
867
+ * ```
868
+ */
869
+ int32_t kreuzberg_config_merge(ExtractionConfig *base, const ExtractionConfig *override_config);
870
+
871
+ /**
872
+ * Load an ExtractionConfig from a file.
873
+ *
874
+ * Returns a JSON string representing the loaded configuration.
875
+ *
876
+ * # Safety
877
+ *
878
+ * - `file_path` must be a valid null-terminated C string
879
+ * - The returned string must be freed with `kreuzberg_free_string`
880
+ * - Returns NULL on error (check `kreuzberg_last_error`)
881
+ */
882
+ char *kreuzberg_load_extraction_config_from_file(const char *file_path);
883
+
884
+ /**
885
+ * Load an ExtractionConfig from a file (returns pointer to config struct).
886
+ *
887
+ * # Safety
888
+ *
889
+ * - `path` must be a valid null-terminated C string
890
+ * - The returned pointer must be freed with `kreuzberg_config_free`
891
+ * - Returns NULL on error (check `kreuzberg_last_error`)
892
+ *
893
+ * # Example (C)
894
+ *
895
+ * ```c
896
+ * ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
897
+ * if (config == NULL) {
898
+ * printf("Error: %s\n", kreuzberg_last_error());
899
+ * return 1;
900
+ * }
901
+ * kreuzberg_config_free(config);
902
+ * ```
903
+ */
904
+ ExtractionConfig *kreuzberg_config_from_file(const char *path);
905
+
906
+ /**
907
+ * Discover and load an ExtractionConfig by searching the current directory and its parents.
908
+ *
909
+ * Searches the current directory and all parent directories for:
910
+ * - `kreuzberg.toml`
911
+ * - `kreuzberg.json`
912
+ *
913
+ * Returns the first config file found as a JSON string.
914
+ *
915
+ * # Safety
916
+ *
917
+ * - The returned string must be freed with `kreuzberg_free_string`
918
+ * - Returns NULL if no config is found or on error
919
+ *
920
+ * # Example (C)
921
+ *
922
+ * ```c
923
+ * char* config_json = kreuzberg_config_discover();
924
+ * if (config_json != NULL) {
925
+ * printf("Discovered config: %s\n", config_json);
926
+ * kreuzberg_free_string(config_json);
927
+ * }
928
+ * ```
929
+ */
930
+ char *kreuzberg_config_discover(void);
931
+
932
+ /**
933
+ * List available embedding preset names.
934
+ *
935
+ * # Safety
936
+ *
937
+ * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
938
+ * - Returns NULL on error (check `kreuzberg_last_error`)
939
+ */
940
+ char *kreuzberg_list_embedding_presets(void);
941
+
942
+ /**
943
+ * Get a specific embedding preset by name.
944
+ *
945
+ * # Safety
946
+ *
947
+ * - `name` must be a valid null-terminated C string
948
+ * - Returned string is a JSON object and must be freed with `kreuzberg_free_string`
949
+ * - Returns NULL on error (check `kreuzberg_last_error`)
950
+ */
951
+ char *kreuzberg_get_embedding_preset(const char *name);
952
+
953
+ /**
954
+ * Returns the validation error code (0).
955
+ *
956
+ * # C Signature
957
+ *
958
+ * ```c
959
+ * uint32_t kreuzberg_error_code_validation(void);
960
+ * ```
961
+ */
962
+ uint32_t kreuzberg_error_code_validation(void);
963
+
964
+ /**
965
+ * Returns the parsing error code (1).
966
+ *
967
+ * # C Signature
968
+ *
969
+ * ```c
970
+ * uint32_t kreuzberg_error_code_parsing(void);
971
+ * ```
972
+ */
973
+ uint32_t kreuzberg_error_code_parsing(void);
974
+
975
+ /**
976
+ * Returns the OCR error code (2).
977
+ *
978
+ * # C Signature
979
+ *
980
+ * ```c
981
+ * uint32_t kreuzberg_error_code_ocr(void);
982
+ * ```
983
+ */
984
+ uint32_t kreuzberg_error_code_ocr(void);
985
+
986
+ /**
987
+ * Returns the missing dependency error code (3).
988
+ *
989
+ * # C Signature
990
+ *
991
+ * ```c
992
+ * uint32_t kreuzberg_error_code_missing_dependency(void);
993
+ * ```
994
+ */
995
+ uint32_t kreuzberg_error_code_missing_dependency(void);
996
+
997
+ /**
998
+ * Returns the I/O error code (4).
999
+ *
1000
+ * # C Signature
1001
+ *
1002
+ * ```c
1003
+ * uint32_t kreuzberg_error_code_io(void);
1004
+ * ```
1005
+ */
1006
+ uint32_t kreuzberg_error_code_io(void);
1007
+
1008
+ /**
1009
+ * Returns the plugin error code (5).
1010
+ *
1011
+ * # C Signature
1012
+ *
1013
+ * ```c
1014
+ * uint32_t kreuzberg_error_code_plugin(void);
1015
+ * ```
1016
+ */
1017
+ uint32_t kreuzberg_error_code_plugin(void);
1018
+
1019
+ /**
1020
+ * Returns the unsupported format error code (6).
1021
+ *
1022
+ * # C Signature
1023
+ *
1024
+ * ```c
1025
+ * uint32_t kreuzberg_error_code_unsupported_format(void);
1026
+ * ```
1027
+ */
1028
+ uint32_t kreuzberg_error_code_unsupported_format(void);
1029
+
1030
+ /**
1031
+ * Returns the internal error code (7).
1032
+ *
1033
+ * # C Signature
1034
+ *
1035
+ * ```c
1036
+ * uint32_t kreuzberg_error_code_internal(void);
1037
+ * ```
1038
+ */
1039
+ uint32_t kreuzberg_error_code_internal(void);
1040
+
1041
+ /**
1042
+ * Returns the total count of valid error codes.
1043
+ *
1044
+ * Currently 8 error codes (0-7). This helps bindings validate error codes.
1045
+ *
1046
+ * # C Signature
1047
+ *
1048
+ * ```c
1049
+ * uint32_t kreuzberg_error_code_count(void);
1050
+ * ```
1051
+ */
1052
+ uint32_t kreuzberg_error_code_count(void);
1053
+
1054
+ /**
1055
+ * Returns the name of an error code as a C string.
1056
+ *
1057
+ * # Arguments
1058
+ *
1059
+ * - `code`: Numeric error code (0-7)
1060
+ *
1061
+ * # Returns
1062
+ *
1063
+ * Pointer to a null-terminated C string with the error name (e.g., "validation", "ocr").
1064
+ * Returns a pointer to "unknown" if the code is invalid.
1065
+ *
1066
+ * The returned pointer is valid for the lifetime of the program and should not be freed.
1067
+ *
1068
+ * # Examples
1069
+ *
1070
+ * ```c
1071
+ * const char* name = kreuzberg_error_code_name(0);
1072
+ * printf("%s\n", name); // prints: validation
1073
+ * ```
1074
+ *
1075
+ * # C Signature
1076
+ *
1077
+ * ```c
1078
+ * const char* kreuzberg_error_code_name(uint32_t code);
1079
+ * ```
1080
+ */
1081
+ const char *kreuzberg_error_code_name(uint32_t code);
1082
+
1083
+ /**
1084
+ * Returns the description of an error code as a C string.
1085
+ *
1086
+ * # Arguments
1087
+ *
1088
+ * - `code`: Numeric error code (0-7)
1089
+ *
1090
+ * # Returns
1091
+ *
1092
+ * Pointer to a null-terminated C string with a description (e.g., "Input validation error").
1093
+ * Returns a pointer to "Unknown error code" if the code is invalid.
1094
+ *
1095
+ * The returned pointer is valid for the lifetime of the program and should not be freed.
1096
+ *
1097
+ * # C Signature
1098
+ *
1099
+ * ```c
1100
+ * const char* kreuzberg_error_code_description(uint32_t code);
1101
+ * ```
1102
+ */
1103
+ const char *kreuzberg_error_code_description(uint32_t code);
1104
+
1105
+ /**
1106
+ * Retrieves detailed error information from the thread-local error storage.
1107
+ *
1108
+ * Returns structured error details including message, code, type, and source location.
1109
+ * This function queries the error state captured by FFI functions and provides
1110
+ * comprehensive error information for binding implementations.
1111
+ *
1112
+ * # Returns
1113
+ *
1114
+ * A `CErrorDetails` structure with the following characteristics:
1115
+ * - All non-NULL string pointers must be freed with `kreuzberg_free_string()`
1116
+ * - NULL pointers indicate the field is not available
1117
+ * - `error_code` is a numeric code (0-7)
1118
+ * - `source_line` is 0 if unknown
1119
+ * - `is_panic` is 1 if error originated from a panic, 0 otherwise
1120
+ *
1121
+ * # Thread Safety
1122
+ *
1123
+ * This function is thread-safe. Each thread has its own error storage.
1124
+ *
1125
+ * # Example (C)
1126
+ *
1127
+ * ```c
1128
+ * CErrorDetails details = kreuzberg_get_error_details();
1129
+ * printf("Error: %s (code=%u, type=%s)\n", details.message, details.error_code, details.error_type);
1130
+ * if (details.source_file != NULL) {
1131
+ * printf(" at %s:%u in %s\n", details.source_file, details.source_line, details.source_function);
1132
+ * }
1133
+ * kreuzberg_free_string(details.message);
1134
+ * kreuzberg_free_string(details.error_type);
1135
+ * if (details.source_file != NULL) kreuzberg_free_string(details.source_file);
1136
+ * if (details.source_function != NULL) kreuzberg_free_string(details.source_function);
1137
+ * if (details.context_info != NULL) kreuzberg_free_string(details.context_info);
1138
+ * ```
1139
+ *
1140
+ * # C Signature
1141
+ *
1142
+ * ```c
1143
+ * typedef struct {
1144
+ * char* message;
1145
+ * uint32_t error_code;
1146
+ * char* error_type;
1147
+ * char* source_file;
1148
+ * char* source_function;
1149
+ * uint32_t source_line;
1150
+ * char* context_info;
1151
+ * int is_panic;
1152
+ * } CErrorDetails;
1153
+ *
1154
+ * CErrorDetails kreuzberg_get_error_details(void);
1155
+ * ```
1156
+ */
1157
+ struct CErrorDetails kreuzberg_get_error_details(void);
1158
+
1159
+ /**
1160
+ * Classifies an error based on the error message string.
1161
+ *
1162
+ * Analyzes an error message and attempts to classify it into one of the standard
1163
+ * Kreuzberg error codes (0-7). This is useful for converting error messages from
1164
+ * external libraries or system calls into Kreuzberg error categories.
1165
+ *
1166
+ * # Arguments
1167
+ *
1168
+ * - `error_message`: Pointer to a null-terminated C string with the error message
1169
+ *
1170
+ * # Returns
1171
+ *
1172
+ * Numeric error code (0-7) indicating the most likely error classification.
1173
+ * Returns 7 (Internal) if the message cannot be reliably classified.
1174
+ *
1175
+ * # Classification Rules
1176
+ *
1177
+ * The classifier looks for common keywords and patterns:
1178
+ * - **0 (Validation)**: "invalid", "validation", "parameter", "constraint", "format mismatch"
1179
+ * - **1 (Parsing)**: "parse", "parsing", "corrupt", "unexpected", "malformed", "invalid format"
1180
+ * - **2 (OCR)**: "ocr", "tesseract", "recognition", "optical"
1181
+ * - **3 (MissingDependency)**: "not found", "missing", "dependency", "not installed", "unavailable"
1182
+ * - **4 (Io)**: "io", "file", "read", "write", "permission", "access", "disk", "exists"
1183
+ * - **5 (Plugin)**: "plugin", "loader", "registry", "extension"
1184
+ * - **6 (UnsupportedFormat)**: "unsupported", "unknown format", "MIME type"
1185
+ *
1186
+ * # Thread Safety
1187
+ *
1188
+ * This function is thread-safe and has no side effects.
1189
+ *
1190
+ * # Example (C)
1191
+ *
1192
+ * ```c
1193
+ * uint32_t code = kreuzberg_classify_error("Failed to open file: permission denied");
1194
+ * if (code == kreuzberg_error_code_io()) {
1195
+ * printf("This is an I/O error\n");
1196
+ * }
1197
+ * ```
1198
+ *
1199
+ * # Safety
1200
+ *
1201
+ * - `error_message` must be a valid null-terminated C string or NULL
1202
+ * - `error_message` must remain valid for the duration of the function call
1203
+ *
1204
+ * # C Signature
1205
+ *
1206
+ * ```c
1207
+ * uint32_t kreuzberg_classify_error(const char* error_message);
1208
+ * ```
1209
+ */
1210
+ uint32_t kreuzberg_classify_error(const char *error_message);
1211
+
1212
+ /**
1213
+ * Extract text and metadata from a file (synchronous).
1214
+ *
1215
+ * # Safety
1216
+ *
1217
+ * - `file_path` must be a valid null-terminated C string
1218
+ * - The returned pointer must be freed with `kreuzberg_free_result`
1219
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1220
+ *
1221
+ * # Example (C)
1222
+ *
1223
+ * ```c
1224
+ * const char* path = "/path/to/document.pdf";
1225
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
1226
+ * if (result != NULL && result->success) {
1227
+ * printf("Content: %s\n", result->content);
1228
+ * printf("MIME: %s\n", result->mime_type);
1229
+ * kreuzberg_free_result(result);
1230
+ * } else {
1231
+ * const char* error = kreuzberg_last_error();
1232
+ * printf("Error: %s\n", error);
1233
+ * }
1234
+ * ```
1235
+ */
1236
+ struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
1237
+
1238
+ /**
1239
+ * Extract text and metadata from a file with custom configuration (synchronous).
1240
+ *
1241
+ * # Safety
1242
+ *
1243
+ * - `file_path` must be a valid null-terminated C string
1244
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1245
+ * - The returned pointer must be freed with `kreuzberg_free_result`
1246
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1247
+ *
1248
+ * # Example (C)
1249
+ *
1250
+ * ```c
1251
+ * const char* path = "/path/to/document.pdf";
1252
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
1253
+ * CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
1254
+ * if (result != NULL && result->success) {
1255
+ * printf("Content: %s\n", result->content);
1256
+ * kreuzberg_free_result(result);
1257
+ * }
1258
+ * ```
1259
+ */
1260
+ struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
1261
+ const char *config_json);
1262
+
1263
+ /**
1264
+ * Extract text and metadata from byte array (synchronous).
1265
+ *
1266
+ * # Safety
1267
+ *
1268
+ * - `data` must be a valid pointer to a byte array of length `data_len`
1269
+ * - `mime_type` must be a valid null-terminated C string
1270
+ * - The returned pointer must be freed with `kreuzberg_free_result`
1271
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1272
+ *
1273
+ * # Example (C)
1274
+ *
1275
+ * ```c
1276
+ * const uint8_t* data = ...; // Document bytes
1277
+ * size_t len = ...; // Length of data
1278
+ * const char* mime = "application/pdf";
1279
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
1280
+ * if (result != NULL && result->success) {
1281
+ * printf("Content: %s\n", result->content);
1282
+ * kreuzberg_free_result(result);
1283
+ * } else {
1284
+ * const char* error = kreuzberg_last_error();
1285
+ * printf("Error: %s\n", error);
1286
+ * }
1287
+ * ```
1288
+ */
1289
+ struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
1290
+ uintptr_t data_len,
1291
+ const char *mime_type);
1292
+
1293
+ /**
1294
+ * Extract text and metadata from byte array with custom configuration (synchronous).
1295
+ *
1296
+ * # Safety
1297
+ *
1298
+ * - `data` must be a valid pointer to a byte array of length `data_len`
1299
+ * - `mime_type` must be a valid null-terminated C string
1300
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1301
+ * - The returned pointer must be freed with `kreuzberg_free_result`
1302
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1303
+ *
1304
+ * # Example (C)
1305
+ *
1306
+ * ```c
1307
+ * const uint8_t* data = ...; // Document bytes
1308
+ * size_t len = ...; // Length of data
1309
+ * const char* mime = "application/pdf";
1310
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
1311
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
1312
+ * if (result != NULL && result->success) {
1313
+ * printf("Content: %s\n", result->content);
1314
+ * kreuzberg_free_result(result);
1315
+ * }
1316
+ * ```
1317
+ */
1318
+ struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
1319
+ uintptr_t data_len,
1320
+ const char *mime_type,
1321
+ const char *config_json);
1322
+
1323
+ /**
1324
+ * Batch extract text and metadata from multiple files (synchronous).
1325
+ *
1326
+ * # Safety
1327
+ *
1328
+ * - `file_paths` must be a valid pointer to an array of null-terminated C strings
1329
+ * - `count` must be the number of file paths in the array
1330
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1331
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
1332
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1333
+ *
1334
+ * # Critical Memory Management
1335
+ *
1336
+ * This function has special memory management requirements due to the need to allocate
1337
+ * an array of result pointers:
1338
+ *
1339
+ * 1. Results are collected in a Vec<*mut CExtractionResult>
1340
+ * 2. The vec is converted to a boxed slice (changes allocation metadata)
1341
+ * 3. The boxed slice pointer is cast to *mut *mut CExtractionResult
1342
+ * 4. This pointer is stored in CBatchResult
1343
+ * 5. Deallocation must reverse this process using slice_from_raw_parts
1344
+ *
1345
+ * The Go segfault issue was caused by incorrect deallocation in the memory module.
1346
+ * This allocation pattern must be perfectly mirrored in the free function.
1347
+ */
1348
+ struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
1349
+ uintptr_t count,
1350
+ const char *config_json);
1351
+
1352
+ /**
1353
+ * Batch extract text and metadata from multiple byte arrays (synchronous).
1354
+ *
1355
+ * # Safety
1356
+ *
1357
+ * - `items` must be a valid pointer to an array of CBytesWithMime structures
1358
+ * - `count` must be the number of items in the array
1359
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
1360
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
1361
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
1362
+ *
1363
+ * # Critical Memory Management
1364
+ *
1365
+ * This function shares the same critical memory management pattern as
1366
+ * `kreuzberg_batch_extract_files_sync`. See that function's documentation
1367
+ * for details on the Box/Vec/slice allocation pattern.
1368
+ */
1369
+ struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
1370
+ uintptr_t count,
1371
+ const char *config_json);
1372
+
1373
+ /**
1374
+ * Free a batch result returned by batch extraction functions.
1375
+ *
1376
+ * # Safety
1377
+ *
1378
+ * - `batch_result` must be a pointer previously returned by a batch extraction function
1379
+ * - `batch_result` can be NULL (no-op)
1380
+ * - `batch_result` must not be used after this call
1381
+ * - All individual results in the batch will be freed automatically
1382
+ *
1383
+ * # Memory Layout
1384
+ *
1385
+ * CRITICAL: The results array is allocated as `Box<[*mut CExtractionResult]>` (boxed slice),
1386
+ * NOT as `Vec<*mut CExtractionResult>`. We must use `Box::from_raw` with a slice pointer,
1387
+ * not `Vec::from_raw_parts`, to avoid Box/Vec mismatch that causes segfaults.
1388
+ *
1389
+ * # Example (C)
1390
+ *
1391
+ * ```c
1392
+ * CBatchResult* batch = kreuzberg_batch_extract_files_sync(paths, count, NULL);
1393
+ * // Use batch...
1394
+ * kreuzberg_free_batch_result(batch);
1395
+ * // batch is now invalid
1396
+ * ```
1397
+ */
1398
+ void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
1399
+
1400
+ /**
1401
+ * Free a string returned by Kreuzberg functions.
1402
+ *
1403
+ * # Safety
1404
+ *
1405
+ * - `s` must be a string previously returned by a Kreuzberg function
1406
+ * - `s` can be NULL (no-op)
1407
+ * - `s` must not be used after this call
1408
+ *
1409
+ * # Example (C)
1410
+ *
1411
+ * ```c
1412
+ * char* str = kreuzberg_clone_string("example");
1413
+ * kreuzberg_free_string(str);
1414
+ * // str is now invalid
1415
+ * ```
1416
+ */
1417
+ void kreuzberg_free_string(char *s);
1418
+
1419
+ /**
1420
+ * Clone a null-terminated string using Rust's allocator.
1421
+ *
1422
+ * # Safety
1423
+ *
1424
+ * - `s` must be a valid null-terminated UTF-8 string
1425
+ * - Returned pointer must be freed with `kreuzberg_free_string`
1426
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1427
+ */
1428
+ char *kreuzberg_clone_string(const char *s);
1429
+
1430
+ /**
1431
+ * Free an extraction result returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`).
1432
+ *
1433
+ * # Safety
1434
+ *
1435
+ * - `result` must be a pointer previously returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync` or `kreuzberg_extract_bytes_sync`)
1436
+ * - `result` can be NULL (no-op)
1437
+ * - `result` must not be used after this call
1438
+ * - All string fields within the result will be freed automatically
1439
+ *
1440
+ * # Memory Layout
1441
+ *
1442
+ * This function frees all 12 string fields in CExtractionResult:
1443
+ * 1. content
1444
+ * 2. mime_type
1445
+ * 3. language
1446
+ * 4. date
1447
+ * 5. subject
1448
+ * 6. tables_json
1449
+ * 7. detected_languages_json
1450
+ * 8. metadata_json
1451
+ * 9. chunks_json
1452
+ * 10. images_json
1453
+ * 11. page_structure_json (FIXED: was missing before PR #3)
1454
+ * 12. pages_json (FIXED: was missing before PR #3)
1455
+ *
1456
+ * # Example (C)
1457
+ *
1458
+ * ```c
1459
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
1460
+ * // Use result...
1461
+ * kreuzberg_free_result(result);
1462
+ * // result is now invalid
1463
+ * ```
1464
+ */
1465
+ void kreuzberg_free_result(struct CExtractionResult *result);
1466
+
1467
+ /**
1468
+ * Detect MIME type from a file path.
1469
+ *
1470
+ * # Safety
1471
+ *
1472
+ * - `file_path` must be a valid null-terminated C string
1473
+ * - The returned string must be freed with `kreuzberg_free_string`
1474
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1475
+ */
1476
+ char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
1477
+
1478
+ /**
1479
+ * Validate that a MIME type is supported by Kreuzberg.
1480
+ *
1481
+ * # Safety
1482
+ *
1483
+ * - `mime_type` must be a valid null-terminated C string
1484
+ * - The returned string must be freed with `kreuzberg_free_string`
1485
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1486
+ */
1487
+ char *kreuzberg_validate_mime_type(const char *mime_type);
1488
+
1489
+ /**
1490
+ * Detect MIME type from raw bytes.
1491
+ *
1492
+ * # Safety
1493
+ *
1494
+ * - `bytes` must point to a valid buffer of at least `len` bytes
1495
+ * - The returned string must be freed with `kreuzberg_free_string`
1496
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1497
+ *
1498
+ * # Example (C)
1499
+ *
1500
+ * ```c
1501
+ * uint8_t data[512];
1502
+ * // ... read data ...
1503
+ * char* mime = kreuzberg_detect_mime_type_from_bytes(data, 512);
1504
+ * if (mime != NULL) {
1505
+ * printf("Detected MIME type: %s\n", mime);
1506
+ * kreuzberg_free_string(mime);
1507
+ * }
1508
+ * ```
1509
+ */
1510
+ char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
1511
+
1512
+ /**
1513
+ * Detect MIME type from file path (checks extension and reads file content).
1514
+ *
1515
+ * # Safety
1516
+ *
1517
+ * - `file_path` must be a valid null-terminated C string
1518
+ * - The returned string must be freed with `kreuzberg_free_string`
1519
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1520
+ *
1521
+ * # Example (C)
1522
+ *
1523
+ * ```c
1524
+ * char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
1525
+ * if (mime == NULL) {
1526
+ * const char* error = kreuzberg_last_error();
1527
+ * printf("Failed to detect MIME type: %s\n", error);
1528
+ * } else {
1529
+ * printf("MIME type: %s\n", mime);
1530
+ * kreuzberg_free_string(mime);
1531
+ * }
1532
+ * ```
1533
+ */
1534
+ char *kreuzberg_detect_mime_type_from_path(const char *file_path);
1535
+
1536
+ /**
1537
+ * Get file extensions for a MIME type.
1538
+ *
1539
+ * Returns a JSON array of file extensions (e.g., ["pdf"] for "application/pdf").
1540
+ *
1541
+ * # Safety
1542
+ *
1543
+ * - `mime_type` must be a valid null-terminated C string
1544
+ * - The returned string must be freed with `kreuzberg_free_string`
1545
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1546
+ *
1547
+ * # Example (C)
1548
+ *
1549
+ * ```c
1550
+ * char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
1551
+ * if (extensions != NULL) {
1552
+ * printf("Extensions: %s\n", extensions);
1553
+ * kreuzberg_free_string(extensions);
1554
+ * }
1555
+ * ```
1556
+ */
1557
+ char *kreuzberg_get_extensions_for_mime(const char *mime_type);
1558
+
1559
+ /**
1560
+ * Register a custom DocumentExtractor via FFI callback.
1561
+ *
1562
+ * # Safety
1563
+ *
1564
+ * - `name` must be a valid null-terminated C string
1565
+ * - `callback` must be a valid function pointer that:
1566
+ * - Does not store the content, mime_type, or config_json pointers
1567
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
1568
+ * - The returned string must be freeable by kreuzberg_free_string
1569
+ * - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
1570
+ * - `priority` determines the order of selection (higher priority preferred)
1571
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1572
+ *
1573
+ * # Example (C)
1574
+ *
1575
+ * ```c
1576
+ * char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
1577
+ * // Extract content from bytes, return JSON ExtractionResult
1578
+ * return strdup("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
1579
+ * }
1580
+ *
1581
+ * bool success = kreuzberg_register_document_extractor(
1582
+ * "my-extractor",
1583
+ * my_extractor,
1584
+ * "application/x-custom,text/x-custom",
1585
+ * 100
1586
+ * );
1587
+ * if (!success) {
1588
+ * const char* error = kreuzberg_last_error();
1589
+ * printf("Failed to register: %s\n", error);
1590
+ * }
1591
+ * ```
1592
+ */
1593
+ bool kreuzberg_register_document_extractor(const char *name,
1594
+ DocumentExtractorCallback callback,
1595
+ const char *mime_types,
1596
+ int32_t priority);
1597
+
1598
+ /**
1599
+ * Unregister a DocumentExtractor by name.
1600
+ *
1601
+ * # Safety
1602
+ *
1603
+ * - `name` must be a valid null-terminated C string
1604
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1605
+ *
1606
+ * # Example (C)
1607
+ *
1608
+ * ```c
1609
+ * bool success = kreuzberg_unregister_document_extractor("my-extractor");
1610
+ * if (!success) {
1611
+ * const char* error = kreuzberg_last_error();
1612
+ * printf("Failed to unregister: %s\n", error);
1613
+ * }
1614
+ * ```
1615
+ */
1616
+ bool kreuzberg_unregister_document_extractor(const char *name);
1617
+
1618
+ /**
1619
+ * List all registered DocumentExtractors as a JSON array of names.
1620
+ *
1621
+ * # Safety
1622
+ *
1623
+ * - Returned string must be freed with `kreuzberg_free_string`.
1624
+ * - Returns NULL on error (check `kreuzberg_last_error`).
1625
+ */
1626
+ char *kreuzberg_list_document_extractors(void);
1627
+
1628
+ /**
1629
+ * Clear all registered DocumentExtractors.
1630
+ *
1631
+ * # Safety
1632
+ *
1633
+ * - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
1634
+ * - Returns true on success, false on error.
1635
+ *
1636
+ * # Example (C)
1637
+ *
1638
+ * ```c
1639
+ * bool success = kreuzberg_clear_document_extractors();
1640
+ * if (!success) {
1641
+ * const char* error = kreuzberg_last_error();
1642
+ * printf("Failed to clear document extractors: %s\n", error);
1643
+ * }
1644
+ * ```
1645
+ */
1646
+ bool kreuzberg_clear_document_extractors(void);
1647
+
1648
+ /**
1649
+ * Register a custom OCR backend via FFI callback.
1650
+ *
1651
+ * # Safety
1652
+ *
1653
+ * - `name` must be a valid null-terminated C string
1654
+ * - `callback` must be a valid function pointer that:
1655
+ * - Does not store the image_bytes pointer
1656
+ * - Returns a null-terminated UTF-8 string or NULL on error
1657
+ * - The returned string must be freeable by kreuzberg_free_string
1658
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1659
+ *
1660
+ * # Example (C)
1661
+ *
1662
+ * ```c
1663
+ * char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
1664
+ * // Implement OCR logic here
1665
+ * // Return allocated string with result, or NULL on error
1666
+ * return strdup("Extracted text");
1667
+ * }
1668
+ *
1669
+ * bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
1670
+ * if (!success) {
1671
+ * const char* error = kreuzberg_last_error();
1672
+ * printf("Failed to register: %s\n", error);
1673
+ * }
1674
+ * ```
1675
+ */
1676
+ bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
1677
+
1678
+ /**
1679
+ * Register a custom OCR backend with explicit language support via FFI callback.
1680
+ *
1681
+ * # Safety
1682
+ *
1683
+ * - `languages_json` must be a null-terminated JSON array of language codes or NULL
1684
+ * - See `kreuzberg_register_ocr_backend` for additional safety notes.
1685
+ */
1686
+ bool kreuzberg_register_ocr_backend_with_languages(const char *name,
1687
+ OcrBackendCallback callback,
1688
+ const char *languages_json);
1689
+
1690
+ /**
1691
+ * Unregister an OCR backend by name.
1692
+ *
1693
+ * # Safety
1694
+ *
1695
+ * - `name` must be a valid null-terminated C string
1696
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1697
+ *
1698
+ * # Example (C)
1699
+ *
1700
+ * ```c
1701
+ * bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
1702
+ * if (!success) {
1703
+ * const char* error = kreuzberg_last_error();
1704
+ * printf("Failed to unregister: %s\n", error);
1705
+ * }
1706
+ * ```
1707
+ */
1708
+ bool kreuzberg_unregister_ocr_backend(const char *name);
1709
+
1710
+ /**
1711
+ * List all registered OCR backends as a JSON array of names.
1712
+ *
1713
+ * # Safety
1714
+ *
1715
+ * - Returned string must be freed with `kreuzberg_free_string`.
1716
+ * - Returns NULL on error (check `kreuzberg_last_error`).
1717
+ *
1718
+ * # Example (C)
1719
+ *
1720
+ * ```c
1721
+ * char* backends = kreuzberg_list_ocr_backends();
1722
+ * if (backends == NULL) {
1723
+ * const char* error = kreuzberg_last_error();
1724
+ * printf("Failed to list backends: %s\n", error);
1725
+ * } else {
1726
+ * printf("OCR backends: %s\n", backends);
1727
+ * kreuzberg_free_string(backends);
1728
+ * }
1729
+ * ```
1730
+ */
1731
+ char *kreuzberg_list_ocr_backends(void);
1732
+
1733
+ /**
1734
+ * Clear all registered OCR backends.
1735
+ *
1736
+ * # Safety
1737
+ *
1738
+ * - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
1739
+ * - Returns true on success, false on error.
1740
+ *
1741
+ * # Example (C)
1742
+ *
1743
+ * ```c
1744
+ * bool success = kreuzberg_clear_ocr_backends();
1745
+ * if (!success) {
1746
+ * const char* error = kreuzberg_last_error();
1747
+ * printf("Failed to clear OCR backends: %s\n", error);
1748
+ * }
1749
+ * ```
1750
+ */
1751
+ bool kreuzberg_clear_ocr_backends(void);
1752
+
1753
+ /**
1754
+ * Get supported languages for an OCR backend.
1755
+ *
1756
+ * Returns a JSON array of supported language codes for the given backend.
1757
+ * Supported backends: "easyocr", "paddleocr", "tesseract"
1758
+ *
1759
+ * # Safety
1760
+ *
1761
+ * - The returned string must be freed with `kreuzberg_free_string`
1762
+ * - Returns NULL if backend not found or on error (check `kreuzberg_last_error`)
1763
+ *
1764
+ * # Example (C)
1765
+ *
1766
+ * ```c
1767
+ * char* languages = kreuzberg_get_ocr_languages("easyocr");
1768
+ * if (languages != NULL) {
1769
+ * printf("EasyOCR languages: %s\n", languages);
1770
+ * kreuzberg_free_string(languages);
1771
+ * }
1772
+ * ```
1773
+ */
1774
+ char *kreuzberg_get_ocr_languages(const char *backend);
1775
+
1776
+ /**
1777
+ * Check if a language is supported by an OCR backend.
1778
+ *
1779
+ * Returns 1 (true) if the language is supported, 0 (false) otherwise.
1780
+ *
1781
+ * # Arguments
1782
+ *
1783
+ * * `backend` - Backend name (e.g., "easyocr", "paddleocr", "tesseract")
1784
+ * * `language` - Language code to check
1785
+ *
1786
+ * # Returns
1787
+ *
1788
+ * 1 if supported, 0 if not supported or backend not found.
1789
+ *
1790
+ * # Example (C)
1791
+ *
1792
+ * ```c
1793
+ * int is_supported = kreuzberg_is_language_supported("easyocr", "en");
1794
+ * if (is_supported) {
1795
+ * printf("English is supported by EasyOCR\n");
1796
+ * }
1797
+ * ```
1798
+ *
1799
+ * # Safety
1800
+ *
1801
+ * - `backend` and `language` must be valid pointers to valid UTF-8 C strings.
1802
+ * - Either pointer may be NULL; the function returns 0 in that case.
1803
+ * - The C strings must remain valid for the duration of the function call.
1804
+ */
1805
+ int32_t kreuzberg_is_language_supported(const char *backend, const char *language);
1806
+
1807
+ /**
1808
+ * Get list of all registered OCR backends with language support.
1809
+ *
1810
+ * Returns a JSON object mapping backend names to language counts.
1811
+ * Example: `{"easyocr": 80, "paddleocr": 14, "tesseract": 100}`
1812
+ *
1813
+ * # Safety
1814
+ *
1815
+ * - The returned string must be freed with `kreuzberg_free_string`
1816
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1817
+ *
1818
+ * # Example (C)
1819
+ *
1820
+ * ```c
1821
+ * char* backends = kreuzberg_list_ocr_backends_with_languages();
1822
+ * if (backends != NULL) {
1823
+ * printf("Available backends: %s\n", backends);
1824
+ * kreuzberg_free_string(backends);
1825
+ * }
1826
+ * ```
1827
+ */
1828
+ char *kreuzberg_list_ocr_backends_with_languages(void);
1829
+
1830
+ /**
1831
+ * Register a custom PostProcessor via FFI callback.
1832
+ *
1833
+ * # Safety
1834
+ *
1835
+ * - `name` must be a valid null-terminated C string
1836
+ * - `callback` must be a valid function pointer that:
1837
+ * - Does not store the result_json pointer
1838
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
1839
+ * - The returned string must be freeable by kreuzberg_free_string
1840
+ * - `priority` determines the order of execution (higher priority runs first)
1841
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1842
+ *
1843
+ * # Example (C)
1844
+ *
1845
+ * ```c
1846
+ * char* my_post_processor(const char* result_json) {
1847
+ * // Parse result_json, modify it, return JSON string
1848
+ * return strdup("{\"content\":\"PROCESSED\"}");
1849
+ * }
1850
+ *
1851
+ * bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
1852
+ * if (!success) {
1853
+ * const char* error = kreuzberg_last_error();
1854
+ * printf("Failed to register: %s\n", error);
1855
+ * }
1856
+ * ```
1857
+ */
1858
+ bool kreuzberg_register_post_processor(const char *name,
1859
+ PostProcessorCallback callback,
1860
+ int32_t priority);
1861
+
1862
+ /**
1863
+ * Register a custom PostProcessor with an explicit processing stage.
1864
+ *
1865
+ * # Safety
1866
+ *
1867
+ * - `name` must be a valid null-terminated C string
1868
+ * - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
1869
+ * - `callback` must be a valid function pointer that:
1870
+ * - Does not store the result_json pointer
1871
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
1872
+ * - The returned string must be freeable by kreuzberg_free_string
1873
+ * - `priority` determines the order of execution within the stage (higher priority runs first)
1874
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1875
+ */
1876
+ bool kreuzberg_register_post_processor_with_stage(const char *name,
1877
+ PostProcessorCallback callback,
1878
+ int32_t priority,
1879
+ const char *stage);
1880
+
1881
+ /**
1882
+ * Unregister a PostProcessor by name.
1883
+ *
1884
+ * # Safety
1885
+ *
1886
+ * - `name` must be a valid null-terminated C string
1887
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1888
+ *
1889
+ * # Example (C)
1890
+ *
1891
+ * ```c
1892
+ * bool success = kreuzberg_unregister_post_processor("my-processor");
1893
+ * if (!success) {
1894
+ * const char* error = kreuzberg_last_error();
1895
+ * printf("Failed to unregister: %s\n", error);
1896
+ * }
1897
+ * ```
1898
+ */
1899
+ bool kreuzberg_unregister_post_processor(const char *name);
1900
+
1901
+ /**
1902
+ * Clear all registered PostProcessors.
1903
+ *
1904
+ * # Safety
1905
+ *
1906
+ * - Removes all registered processors. Subsequent extractions will run without them.
1907
+ * - Returns true on success, false on error.
1908
+ */
1909
+ bool kreuzberg_clear_post_processors(void);
1910
+
1911
+ /**
1912
+ * List all registered PostProcessors as a JSON array of names.
1913
+ *
1914
+ * # Safety
1915
+ *
1916
+ * - Returned string must be freed with `kreuzberg_free_string`.
1917
+ * - Returns NULL on error (check `kreuzberg_last_error`).
1918
+ */
1919
+ char *kreuzberg_list_post_processors(void);
1920
+
1921
+ /**
1922
+ * Register a custom Validator via FFI callback.
1923
+ *
1924
+ * # Safety
1925
+ *
1926
+ * - `name` must be a valid null-terminated C string
1927
+ * - `callback` must be a valid function pointer that:
1928
+ * - Does not store the result_json pointer
1929
+ * - Returns a null-terminated UTF-8 string (error message) if validation fails
1930
+ * - Returns NULL if validation passes
1931
+ * - The returned string must be freeable by kreuzberg_free_string
1932
+ * - `priority` determines the order of validation (higher priority runs first)
1933
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1934
+ *
1935
+ * # Example (C)
1936
+ *
1937
+ * ```c
1938
+ * char* my_validator(const char* result_json) {
1939
+ * // Parse result_json, validate it
1940
+ * // Return error message if validation fails, NULL if passes
1941
+ * if (invalid) {
1942
+ * return strdup("Validation failed: content too short");
1943
+ * }
1944
+ * return NULL;
1945
+ * }
1946
+ *
1947
+ * bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
1948
+ * if (!success) {
1949
+ * const char* error = kreuzberg_last_error();
1950
+ * printf("Failed to register: %s\n", error);
1951
+ * }
1952
+ * ```
1953
+ */
1954
+ bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
1955
+
1956
+ /**
1957
+ * Unregister a Validator by name.
1958
+ *
1959
+ * # Safety
1960
+ *
1961
+ * - `name` must be a valid null-terminated C string
1962
+ * - Returns true on success, false on error (check kreuzberg_last_error)
1963
+ *
1964
+ * # Example (C)
1965
+ *
1966
+ * ```c
1967
+ * bool success = kreuzberg_unregister_validator("my-validator");
1968
+ * if (!success) {
1969
+ * const char* error = kreuzberg_last_error();
1970
+ * printf("Failed to unregister: %s\n", error);
1971
+ * }
1972
+ * ```
1973
+ */
1974
+ bool kreuzberg_unregister_validator(const char *name);
1975
+
1976
+ /**
1977
+ * Clear all registered Validators.
1978
+ *
1979
+ * # Safety
1980
+ *
1981
+ * - Removes all validators. Subsequent extractions will skip custom validation.
1982
+ * - Returns true on success, false on error.
1983
+ */
1984
+ bool kreuzberg_clear_validators(void);
1985
+
1986
+ /**
1987
+ * List all registered Validators as a JSON array of names.
1988
+ *
1989
+ * # Safety
1990
+ *
1991
+ * - Returned string must be freed with `kreuzberg_free_string`.
1992
+ * - Returns NULL on error (check `kreuzberg_last_error`).
1993
+ */
1994
+ char *kreuzberg_list_validators(void);
1995
+
1996
+ /**
1997
+ * Get page count from extraction result.
1998
+ *
1999
+ * Returns the total number of pages/slides/sheets detected in the document.
2000
+ *
2001
+ * # Arguments
2002
+ *
2003
+ * * `result` - Pointer to an ExtractionResult structure
2004
+ *
2005
+ * # Returns
2006
+ *
2007
+ * The page count (>= 0) if successful, or -1 on error (check `kreuzberg_last_error`).
2008
+ *
2009
+ * # Safety
2010
+ *
2011
+ * - `result` must be a valid pointer to an ExtractionResult
2012
+ * - `result` cannot be NULL
2013
+ *
2014
+ * # Example (C)
2015
+ *
2016
+ * ```c
2017
+ * ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
2018
+ * if (result != NULL) {
2019
+ * int page_count = kreuzberg_result_get_page_count(result);
2020
+ * if (page_count >= 0) {
2021
+ * printf("Document has %d pages\n", page_count);
2022
+ * }
2023
+ * kreuzberg_result_free(result);
2024
+ * }
2025
+ * ```
2026
+ */
2027
+ int32_t kreuzberg_result_get_page_count(const ExtractionResult *result);
2028
+
2029
+ /**
2030
+ * Get chunk count from extraction result.
2031
+ *
2032
+ * Returns the number of text chunks when chunking is enabled, or 0 if chunking
2033
+ * was not performed.
2034
+ *
2035
+ * # Arguments
2036
+ *
2037
+ * * `result` - Pointer to an ExtractionResult structure
2038
+ *
2039
+ * # Returns
2040
+ *
2041
+ * The chunk count (>= 0) if successful, or -1 on error (check `kreuzberg_last_error`).
2042
+ *
2043
+ * # Safety
2044
+ *
2045
+ * - `result` must be a valid pointer to an ExtractionResult
2046
+ * - `result` cannot be NULL
2047
+ *
2048
+ * # Example (C)
2049
+ *
2050
+ * ```c
2051
+ * ExtractionResult* result = kreuzberg_extract_file("document.pdf", config);
2052
+ * if (result != NULL) {
2053
+ * int chunk_count = kreuzberg_result_get_chunk_count(result);
2054
+ * if (chunk_count >= 0) {
2055
+ * printf("Document has %d chunks\n", chunk_count);
2056
+ * }
2057
+ * kreuzberg_result_free(result);
2058
+ * }
2059
+ * ```
2060
+ */
2061
+ int32_t kreuzberg_result_get_chunk_count(const ExtractionResult *result);
2062
+
2063
+ /**
2064
+ * Get detected language from extraction result.
2065
+ *
2066
+ * Returns the primary detected language as an ISO 639 language code.
2067
+ * If multiple languages were detected, returns the primary one.
2068
+ *
2069
+ * # Arguments
2070
+ *
2071
+ * * `result` - Pointer to an ExtractionResult structure
2072
+ *
2073
+ * # Returns
2074
+ *
2075
+ * A pointer to a C string containing the language code (e.g., "en", "de"),
2076
+ * or NULL if no language was detected or on error (check `kreuzberg_last_error`).
2077
+ *
2078
+ * The returned pointer must be freed with `kreuzberg_free_string()`.
2079
+ *
2080
+ * # Safety
2081
+ *
2082
+ * - `result` must be a valid pointer to an ExtractionResult
2083
+ * - `result` cannot be NULL
2084
+ * - The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`
2085
+ *
2086
+ * # Example (C)
2087
+ *
2088
+ * ```c
2089
+ * ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
2090
+ * if (result != NULL) {
2091
+ * char* language = kreuzberg_result_get_detected_language(result);
2092
+ * if (language != NULL) {
2093
+ * printf("Detected language: %s\n", language);
2094
+ * kreuzberg_free_string(language);
2095
+ * }
2096
+ * kreuzberg_result_free(result);
2097
+ * }
2098
+ * ```
2099
+ */
2100
+ char *kreuzberg_result_get_detected_language(const ExtractionResult *result);
2101
+
2102
+ /**
2103
+ * Get a metadata field by name.
2104
+ *
2105
+ * Retrieves a metadata field from the extraction result and returns its value
2106
+ * as a JSON string. Supports nested fields with dot notation (e.g., "format.pages").
2107
+ *
2108
+ * # Arguments
2109
+ *
2110
+ * * `result` - Pointer to an ExtractionResult structure
2111
+ * * `field_name` - Null-terminated C string with the field name
2112
+ *
2113
+ * # Returns
2114
+ *
2115
+ * A CMetadataField structure containing:
2116
+ * - `name`: The field name (caller should not free)
2117
+ * - `json_value`: Pointer to field value as JSON string (must free with `kreuzberg_free_string`),
2118
+ * or NULL if field doesn't exist
2119
+ * - `is_null`: 1 if field doesn't exist, 0 if it does
2120
+ *
2121
+ * # Safety
2122
+ *
2123
+ * - `result` must be a valid pointer to an ExtractionResult
2124
+ * - `field_name` must be a valid null-terminated C string
2125
+ * - Neither parameter can be NULL
2126
+ * - The returned `json_value` (if non-NULL) must be freed with `kreuzberg_free_string`
2127
+ *
2128
+ * # Example (C)
2129
+ *
2130
+ * ```c
2131
+ * ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
2132
+ * if (result != NULL) {
2133
+ * CMetadataField title_field = kreuzberg_result_get_metadata_field(result, "title");
2134
+ * if (!title_field.is_null) {
2135
+ * printf("Title: %s\n", title_field.json_value);
2136
+ * kreuzberg_free_string(title_field.json_value);
2137
+ * }
2138
+ *
2139
+ * CMetadataField author_field = kreuzberg_result_get_metadata_field(result, "authors");
2140
+ * if (!author_field.is_null) {
2141
+ * printf("Authors: %s\n", author_field.json_value);
2142
+ * kreuzberg_free_string(author_field.json_value);
2143
+ * }
2144
+ *
2145
+ * kreuzberg_result_free(result);
2146
+ * }
2147
+ * ```
2148
+ */
2149
+ struct CMetadataField kreuzberg_result_get_metadata_field(const ExtractionResult *result,
2150
+ const char *field_name);
2151
+
2152
+ /**
2153
+ * Create a new result pool with specified initial capacity.
2154
+ *
2155
+ * Pre-allocates storage for `capacity` results to reduce allocation overhead.
2156
+ * Pool automatically grows if capacity is exceeded.
2157
+ *
2158
+ * # Arguments
2159
+ *
2160
+ * * `capacity` - Initial capacity (number of results to pre-allocate storage for)
2161
+ *
2162
+ * # Returns
2163
+ *
2164
+ * Pointer to allocated pool, or NULL on allocation failure (check `kreuzberg_last_error`).
2165
+ *
2166
+ * # Memory Management
2167
+ *
2168
+ * Caller must free the returned pool with `kreuzberg_result_pool_free()`.
2169
+ *
2170
+ * # Example (C)
2171
+ *
2172
+ * ```c
2173
+ * CResultPool* pool = kreuzberg_result_pool_new(100);
2174
+ * if (pool == NULL) {
2175
+ * fprintf(stderr, "Failed to create pool: %s\n", kreuzberg_last_error());
2176
+ * return;
2177
+ * }
2178
+ * // Use pool...
2179
+ * kreuzberg_result_pool_free(pool);
2180
+ * ```
2181
+ */
2182
+ struct ResultPool *kreuzberg_result_pool_new(uintptr_t capacity);
2183
+
2184
+ /**
2185
+ * Reset pool by clearing all results.
2186
+ *
2187
+ * Removes all results from the pool but retains allocated capacity.
2188
+ * After reset, pool can be reused for new extractions.
2189
+ *
2190
+ * # Arguments
2191
+ *
2192
+ * * `pool` - Pointer to result pool
2193
+ *
2194
+ * # Safety
2195
+ *
2196
+ * - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
2197
+ * - `pool` cannot be NULL
2198
+ * - All result pointers obtained from this pool become invalid after reset
2199
+ * - Must not be called concurrently with extractions using same pool
2200
+ *
2201
+ * # Example (C)
2202
+ *
2203
+ * ```c
2204
+ * CResultPool* pool = kreuzberg_result_pool_new(100);
2205
+ *
2206
+ * // Process batch 1
2207
+ * for (int i = 0; i < 50; i++) {
2208
+ * kreuzberg_extract_file_into_pool(files[i], NULL, pool);
2209
+ * }
2210
+ *
2211
+ * // Reset and reuse
2212
+ * kreuzberg_result_pool_reset(pool);
2213
+ *
2214
+ * // Process batch 2
2215
+ * for (int i = 0; i < 50; i++) {
2216
+ * kreuzberg_extract_file_into_pool(other_files[i], NULL, pool);
2217
+ * }
2218
+ *
2219
+ * kreuzberg_result_pool_free(pool);
2220
+ * ```
2221
+ */
2222
+ void kreuzberg_result_pool_reset(struct ResultPool *pool);
2223
+
2224
+ /**
2225
+ * Free result pool and all contained results.
2226
+ *
2227
+ * Releases all memory associated with the pool. All result pointers
2228
+ * obtained from this pool become invalid.
2229
+ *
2230
+ * # Arguments
2231
+ *
2232
+ * * `pool` - Pointer to result pool
2233
+ *
2234
+ * # Safety
2235
+ *
2236
+ * - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
2237
+ * - `pool` can be NULL (no-op)
2238
+ * - All result pointers from this pool become invalid after free
2239
+ * - Must not be called twice on same pool (double-free)
2240
+ * - Must not be called concurrently with other pool operations
2241
+ *
2242
+ * # Example (C)
2243
+ *
2244
+ * ```c
2245
+ * CResultPool* pool = kreuzberg_result_pool_new(100);
2246
+ * // Use pool...
2247
+ * kreuzberg_result_pool_free(pool);
2248
+ * pool = NULL; // Prevent double-free
2249
+ * ```
2250
+ */
2251
+ void kreuzberg_result_pool_free(struct ResultPool *pool);
2252
+
2253
+ /**
2254
+ * Get statistics about pool usage and efficiency.
2255
+ *
2256
+ * Returns metrics about current pool state, allocation counts, and memory usage.
2257
+ *
2258
+ * # Arguments
2259
+ *
2260
+ * * `pool` - Pointer to result pool
2261
+ *
2262
+ * # Returns
2263
+ *
2264
+ * Statistics structure with current metrics, or zeroed structure on error.
2265
+ *
2266
+ * # Safety
2267
+ *
2268
+ * - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
2269
+ * - `pool` cannot be NULL
2270
+ *
2271
+ * # Example (C)
2272
+ *
2273
+ * ```c
2274
+ * CResultPoolStats stats = kreuzberg_result_pool_stats(pool);
2275
+ * printf("Pool: %zu/%zu results, %zu allocations, %zu bytes\n",
2276
+ * stats.current_count, stats.capacity,
2277
+ * stats.total_allocations, stats.estimated_memory_bytes);
2278
+ *
2279
+ * if (stats.growth_events > 0) {
2280
+ * printf("Warning: Pool grew %zu times (consider larger initial capacity)\n",
2281
+ * stats.growth_events);
2282
+ * }
2283
+ * ```
2284
+ */
2285
+ struct CResultPoolStats kreuzberg_result_pool_stats(const struct ResultPool *pool);
2286
+
2287
+ /**
2288
+ * Extract file and store result in pool.
2289
+ *
2290
+ * Extracts document content and adds result to pool. Returns borrowed reference
2291
+ * to result that remains valid until pool is reset or freed.
2292
+ *
2293
+ * # Arguments
2294
+ *
2295
+ * * `file_path` - Null-terminated UTF-8 file path
2296
+ * * `config_json` - Optional JSON configuration string (NULL for defaults)
2297
+ * * `pool` - Pointer to result pool
2298
+ *
2299
+ * # Returns
2300
+ *
2301
+ * Borrowed pointer to extraction result view, or NULL on error (check `kreuzberg_last_error`).
2302
+ * Result remains valid until pool is reset or freed.
2303
+ *
2304
+ * # Safety
2305
+ *
2306
+ * - `file_path` must be valid null-terminated UTF-8 string
2307
+ * - `config_json` must be valid null-terminated UTF-8 if not NULL
2308
+ * - `pool` must be valid pointer returned by `kreuzberg_result_pool_new()`
2309
+ * - No argument may be NULL except `config_json`, which is optional
2310
+ * - Returned pointer is borrowed from pool (do not free separately)
2311
+ * - Returned pointer becomes invalid when pool is reset or freed
2312
+ *
2313
+ * # Example (C)
2314
+ *
2315
+ * ```c
2316
+ * CResultPool* pool = kreuzberg_result_pool_new(100);
2317
+ *
2318
+ * const CExtractionResultView* result = kreuzberg_extract_file_into_pool(
2319
+ * "document.pdf", NULL, pool
2320
+ * );
2321
+ *
2322
+ * if (result != NULL) {
2323
+ * // Access result fields
2324
+ * printf("Content length: %zu\n", result->content_len);
2325
+ * printf("MIME type: %.*s\n",
2326
+ * (int)result->mime_type_len,
2327
+ * result->mime_type_ptr);
2328
+ * }
2329
+ *
2330
+ * // Result remains valid until pool is reset/freed
2331
+ * kreuzberg_result_pool_free(pool);
2332
+ * ```
2333
+ */
2334
+ const struct CExtractionResultView *kreuzberg_extract_file_into_pool(const char *file_path,
2335
+ const char *config_json,
2336
+ struct ResultPool *pool);
2337
+
2338
+ /**
2339
+ * Extract file into pool and get zero-copy view.
2340
+ *
2341
+ * Convenience function that combines extraction and view creation.
2342
+ * Equivalent to `kreuzberg_extract_file_into_pool()` followed by
2343
+ * `kreuzberg_get_result_view()`.
2344
+ *
2345
+ * # Arguments
2346
+ *
2347
+ * Same as `kreuzberg_extract_file_into_pool()`
2348
+ *
2349
+ * # Returns
2350
+ *
2351
+ * Zero-copy view of result, or zeroed view on error.
2352
+ *
2353
+ * # Safety
2354
+ *
2355
+ * Same requirements as `kreuzberg_extract_file_into_pool()`.
2356
+ * View is valid until pool is reset or freed.
2357
+ */
2358
+ struct CExtractionResultView kreuzberg_extract_file_into_pool_view(const char *file_path,
2359
+ const char *config_json,
2360
+ struct ResultPool *pool);
2361
+
2362
+ /**
2363
+ * Get a zero-copy view of an extraction result.
2364
+ *
2365
+ * Creates a view structure with direct pointers to result data without allocation.
2366
+ * The view is valid only while the source `result` remains valid.
2367
+ *
2368
+ * # Arguments
2369
+ *
2370
+ * * `result` - Pointer to an ExtractionResult structure
2371
+ * * `out_view` - Pointer to a CExtractionResultView structure to populate
2372
+ *
2373
+ * # Returns
2374
+ *
2375
+ * 0 on success, -1 on error (check `kreuzberg_last_error`).
2376
+ *
2377
+ * # Safety
2378
+ *
2379
+ * - `result` must be a valid pointer to an ExtractionResult
2380
+ * - `out_view` must be a valid pointer to writable memory
2381
+ * - Neither parameter can be NULL
2382
+ * - The returned view is valid ONLY while `result` is not freed
2383
+ * - Caller MUST NOT use the view after calling `kreuzberg_result_free(result)`
2384
+ *
2385
+ * # Lifetime Safety
2386
+ *
2387
+ * ```text
2388
+ * ExtractionResult lifetime: |-------------------------------------|
2389
+ * View lifetime: |----------------------|
2390
+ * SAFE FREE → INVALID
2391
+ * ```
2392
+ *
2393
+ * # Example (C)
2394
+ *
2395
+ * ```c
2396
+ * ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
2397
+ * if (result != NULL) {
2398
+ * CExtractionResultView view;
2399
+ * if (kreuzberg_get_result_view(result, &view) == 0) {
2400
+ * // Direct access to content without copying
2401
+ * printf("Content length: %zu bytes\n", view.content_len);
2402
+ * printf("MIME type: %.*s\n", (int)view.mime_type_len, view.mime_type_ptr);
2403
+ * printf("Tables: %zu, Chunks: %zu\n", view.table_count, view.chunk_count);
2404
+ *
2405
+ * // No need to free the view (no allocations)
2406
+ * }
2407
+ *
2408
+ * kreuzberg_result_free(result); // After this, view is INVALID
2409
+ * }
2410
+ * ```
2411
+ */
2412
+ int32_t kreuzberg_get_result_view(const ExtractionResult *result,
2413
+ struct CExtractionResultView *out_view);
2414
+
2415
+ /**
2416
+ * Get direct access to content from a result view.
2417
+ *
2418
+ * Helper function to retrieve content as a slice without copying.
2419
+ *
2420
+ * # Arguments
2421
+ *
2422
+ * * `view` - Pointer to a CExtractionResultView structure
2423
+ * * `out_ptr` - Pointer to receive the content pointer
2424
+ * * `out_len` - Pointer to receive the content length
2425
+ *
2426
+ * # Returns
2427
+ *
2428
+ * 0 on success, -1 on error (check `kreuzberg_last_error`).
2429
+ *
2430
+ * # Safety
2431
+ *
2432
+ * - `view` must be a valid pointer to a CExtractionResultView
2433
+ * - `out_ptr` and `out_len` must be valid writable pointers
2434
+ * - The returned content pointer is valid only while the source ExtractionResult is valid
2435
+ *
2436
+ * # Example (C)
2437
+ *
2438
+ * ```c
2439
+ * const uint8_t* content;
2440
+ * size_t content_len;
2441
+ * if (kreuzberg_view_get_content(&view, &content, &content_len) == 0) {
2442
+ * // Process content directly without copying
2443
+ * fwrite(content, 1, content_len, stdout);
2444
+ * }
2445
+ * ```
2446
+ */
2447
+ int32_t kreuzberg_view_get_content(const struct CExtractionResultView *view,
2448
+ const uint8_t **out_ptr,
2449
+ uintptr_t *out_len);
2450
+
2451
+ /**
2452
+ * Get direct access to MIME type from a result view.
2453
+ *
2454
+ * # Arguments
2455
+ *
2456
+ * * `view` - Pointer to a CExtractionResultView structure
2457
+ * * `out_ptr` - Pointer to receive the MIME type pointer
2458
+ * * `out_len` - Pointer to receive the MIME type length
2459
+ *
2460
+ * # Returns
2461
+ *
2462
+ * 0 on success, -1 on error (check `kreuzberg_last_error`).
2463
+ *
2464
+ * # Safety
2465
+ *
2466
+ * - `view` must be a valid pointer to a CExtractionResultView
2467
+ * - `out_ptr` and `out_len` must be valid writable pointers
2468
+ * - The returned MIME type pointer is valid only while the source ExtractionResult is valid
2469
+ *
2470
+ * # Example (C)
2471
+ *
2472
+ * ```c
2473
+ * const uint8_t* mime_type;
2474
+ * size_t mime_len;
2475
+ * if (kreuzberg_view_get_mime_type(&view, &mime_type, &mime_len) == 0) {
2476
+ * printf("MIME: %.*s\n", (int)mime_len, mime_type);
2477
+ * }
2478
+ * ```
2479
+ */
2480
+ int32_t kreuzberg_view_get_mime_type(const struct CExtractionResultView *view,
2481
+ const uint8_t **out_ptr,
2482
+ uintptr_t *out_len);
2483
+
2484
+ /**
2485
+ * Intern a string and return pointer to shared C string.
2486
+ *
2487
+ * If the string has already been interned, returns pointer to existing allocation.
2488
+ * Otherwise, creates new allocation. Pointer remains valid until all references
2489
+ * are freed with `kreuzberg_free_interned_string()`.
2490
+ *
2491
+ * # Arguments
2492
+ *
2493
+ * * `s` - Null-terminated UTF-8 string to intern
2494
+ *
2495
+ * # Returns
2496
+ *
2497
+ * Pointer to interned C string, or NULL on error (invalid UTF-8, allocation failure).
2498
+ * Caller must eventually free with `kreuzberg_free_interned_string()`.
2499
+ *
2500
+ * # Reference Counting
2501
+ *
2502
+ * Multiple calls with the same string return the same pointer but increment
2503
+ * an internal reference count. The string is freed only when all references
2504
+ * are released.
2505
+ *
2506
+ * # Thread Safety
2507
+ *
2508
+ * Thread-safe. Multiple threads can call concurrently.
2509
+ *
2510
+ * # Safety
2511
+ *
2512
+ * - `s` must be valid null-terminated UTF-8 string
2513
+ * - `s` cannot be NULL
2514
+ * - Returned pointer must not be modified
2515
+ * - Caller must call `kreuzberg_free_interned_string()` for each `kreuzberg_intern_string()` call
2516
+ *
2517
+ * # Example (C)
2518
+ *
2519
+ * ```c
2520
+ * const char* mime1 = kreuzberg_intern_string("application/pdf");
2521
+ * const char* mime2 = kreuzberg_intern_string("application/pdf");
2522
+ *
2523
+ * // Same string = same pointer (memory shared)
2524
+ * assert(mime1 == mime2);
2525
+ *
2526
+ * // Free each reference
2527
+ * kreuzberg_free_interned_string(mime1);
2528
+ * kreuzberg_free_interned_string(mime2);
2529
+ * ```
2530
+ */
2531
+ const char *kreuzberg_intern_string(const char *s);
2532
+
2533
+ /**
2534
+ * Free an interned string reference.
2535
+ *
2536
+ * Decrements reference count for the interned string. If reference count
2537
+ * reaches zero, the string is freed from the intern table.
2538
+ *
2539
+ * # Arguments
2540
+ *
2541
+ * * `s` - Pointer returned by `kreuzberg_intern_string()`
2542
+ *
2543
+ * # Safety
2544
+ *
2545
+ * - `s` must be a pointer returned by `kreuzberg_intern_string()`
2546
+ * - `s` can be NULL (no-op)
2547
+ * - Must be called at most once per `kreuzberg_intern_string()` call that returned this pointer (extra calls are a double-free)
2548
+ * - Pointer becomes invalid after last reference is freed
2549
+ *
2550
+ * # Example (C)
2551
+ *
2552
+ * ```c
2553
+ * const char* mime = kreuzberg_intern_string("application/pdf");
2554
+ * // Use mime...
2555
+ * kreuzberg_free_interned_string(mime);
2556
+ * // Don't use mime after this point
2557
+ * ```
2558
+ */
2559
+ void kreuzberg_free_interned_string(const char *s);
2560
+
2561
+ /**
2562
+ * Get statistics about string interning efficiency.
2563
+ *
2564
+ * Returns metrics about unique strings, cache hits/misses, and memory savings.
2565
+ *
2566
+ * # Returns
2567
+ *
2568
+ * Statistics structure with current metrics.
2569
+ *
2570
+ * # Example (C)
2571
+ *
2572
+ * ```c
2573
+ * CStringInternStats stats = kreuzberg_string_intern_stats();
2574
+ * printf("Interned: %zu unique strings\n", stats.unique_count);
2575
+ * printf("Requests: %zu total (%zu hits, %zu misses)\n",
2576
+ * stats.total_requests, stats.cache_hits, stats.cache_misses);
2577
+ * printf("Memory saved: %zu bytes\n", stats.estimated_memory_saved);
2578
+ * printf("Hit rate: %.1f%%\n",
2579
+ * 100.0 * stats.cache_hits / stats.total_requests);
2580
+ * ```
2581
+ */
2582
+ struct CStringInternStats kreuzberg_string_intern_stats(void);
2583
+
2584
+ /**
2585
+ * Reset the intern table, freeing all interned strings.
2586
+ *
2587
+ * **WARNING**: This invalidates all pointers returned by `kreuzberg_intern_string()`.
2588
+ * Only use during shutdown or testing.
2589
+ *
2590
+ * # Safety
2591
+ *
2592
+ * - Must not be called while any interned string pointers are in use
2593
+ * - All existing interned pointers become invalid
2594
+ * - Thread-safe but can race with concurrent intern operations
2595
+ */
2596
+ void kreuzberg_string_intern_reset(void);
2597
+
2598
+ /**
2599
+ * Get the last error message from a failed operation.
2600
+ *
2601
+ * # Safety
2602
+ *
2603
+ * - Returns a static string that does not need to be freed
2604
+ * - Returns NULL if no error has occurred
2605
+ * - The returned string is valid until the next Kreuzberg function call on the same thread
2606
+ *
2607
+ * # Example (C)
2608
+ *
2609
+ * ```c
2610
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
2611
+ * if (result == NULL) {
2612
+ * const char* error = kreuzberg_last_error();
2613
+ * printf("Error: %s\n", error);
2614
+ * }
2615
+ * ```
2616
+ */
2617
+ const char *kreuzberg_last_error(void);
2618
+
2619
+ /**
2620
+ * Get the error code for the last error.
2621
+ *
2622
+ * Returns the error code as an `int32_t`. Error codes are defined in the `ErrorCode` enum:
2623
+ * - 0: Success (no error)
2624
+ * - 1: GenericError
2625
+ * - 2: Panic
2626
+ * - 3: InvalidArgument
2627
+ * - 4: IoError
2628
+ * - 5: ParsingError
2629
+ * - 6: OcrError
2630
+ * - 7: MissingDependency
2631
+ *
2632
+ * # Safety
2633
+ *
2634
+ * This function is thread-safe and always safe to call.
2635
+ *
2636
+ * # Example (C)
2637
+ *
2638
+ * ```c
2639
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
2640
+ * if (result == NULL) {
2641
+ * int32_t code = kreuzberg_last_error_code();
2642
+ * if (code == 2) {
2643
+ * // A panic occurred
2644
+ * }
2645
+ * }
2646
+ * ```
2647
+ */
2648
+ int32_t kreuzberg_last_error_code(void);
2649
+
2650
+ /**
2651
+ * Get the panic context for the last error (if it was a panic).
2652
+ *
2653
+ * Returns a JSON object with panic details including:
2654
+ * - file: Source file where panic occurred
2655
+ * - line: Line number in source file
2656
+ * - function: Name of the function that panicked
2657
+ * - message: Panic message
2658
+ * - timestamp_secs: Unix timestamp when panic occurred
2659
+ *
2660
+ * # Safety
2661
+ *
2662
+ * - The returned string must be freed with `kreuzberg_free_string`
2663
+ * - Returns NULL if the last error was not a panic or no error has occurred
2664
+ *
2665
+ * # Example (C)
2666
+ *
2667
+ * ```c
2668
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
2669
+ * if (result == NULL && kreuzberg_last_error_code() == 2) {
2670
+ * char* context = kreuzberg_last_panic_context();
2671
+ * if (context != NULL) {
2672
+ * printf("Panic context: %s\n", context);
2673
+ * kreuzberg_free_string(context);
2674
+ * }
2675
+ * }
2676
+ * ```
2677
+ */
2678
+ char *kreuzberg_last_panic_context(void);
2679
+
2680
+ /**
2681
+ * Get the library version string.
2682
+ *
2683
+ * # Safety
2684
+ *
2685
+ * - Returns a static string that does not need to be freed
2686
+ * - The returned string is always valid
2687
+ *
2688
+ * # Example (C)
2689
+ *
2690
+ * ```c
2691
+ * const char* version = kreuzberg_version();
2692
+ * printf("Kreuzberg version: %s\n", version);
2693
+ * ```
2694
+ */
2695
+ const char *kreuzberg_version(void);
2696
+
2697
+ /**
2698
+ * Validates a binarization method string.
2699
+ *
2700
+ * # Arguments
2701
+ *
2702
+ * * `method` - C string containing the binarization method (e.g., "otsu", "adaptive", "sauvola")
2703
+ *
2704
+ * # Returns
2705
+ *
2706
+ * - `1` if valid
2707
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2708
+ *
2709
+ * # Safety
2710
+ *
2711
+ * * `method` must be a valid pointer to a null-terminated UTF-8 string
2712
+ * * `method` cannot be NULL
2713
+ * * The string must be valid for the duration of the call
2714
+ *
2715
+ * # C Signature
2716
+ *
2717
+ * ```c
2718
+ * int32_t kreuzberg_validate_binarization_method(const char* method);
2719
+ * ```
2720
+ */
2721
+ int32_t kreuzberg_validate_binarization_method(const char *method);
2722
+
2723
+ /**
2724
+ * Validates an OCR backend string.
2725
+ *
2726
+ * # Arguments
2727
+ *
2728
+ * * `backend` - C string containing the OCR backend (e.g., "tesseract", "easyocr", "paddleocr")
2729
+ *
2730
+ * # Returns
2731
+ *
2732
+ * - `1` if valid
2733
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2734
+ *
2735
+ * # Safety
2736
+ *
2737
+ * * `backend` must be a valid pointer to a null-terminated UTF-8 string
2738
+ * * `backend` cannot be NULL
2739
+ * * The string must be valid for the duration of the call
2740
+ *
2741
+ * # C Signature
2742
+ *
2743
+ * ```c
2744
+ * int32_t kreuzberg_validate_ocr_backend(const char* backend);
2745
+ * ```
2746
+ */
2747
+ int32_t kreuzberg_validate_ocr_backend(const char *backend);
2748
+
2749
+ /**
2750
+ * Validates a language code (ISO 639-1 or 639-3 format).
2751
+ *
2752
+ * Accepts both 2-letter codes (e.g., "en", "de") and 3-letter codes (e.g., "eng", "deu").
2753
+ *
2754
+ * # Arguments
2755
+ *
2756
+ * * `code` - C string containing the language code
2757
+ *
2758
+ * # Returns
2759
+ *
2760
+ * - `1` if valid
2761
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2762
+ *
2763
+ * # Safety
2764
+ *
2765
+ * * `code` must be a valid pointer to a null-terminated UTF-8 string
2766
+ * * `code` cannot be NULL
2767
+ * * The string must be valid for the duration of the call
2768
+ *
2769
+ * # C Signature
2770
+ *
2771
+ * ```c
2772
+ * int32_t kreuzberg_validate_language_code(const char* code);
2773
+ * ```
2774
+ */
2775
+ int32_t kreuzberg_validate_language_code(const char *code);
2776
+
2777
+ /**
2778
+ * Validates a token reduction level string.
2779
+ *
2780
+ * # Arguments
2781
+ *
2782
+ * * `level` - C string containing the token reduction level (e.g., "off", "light", "moderate")
2783
+ *
2784
+ * # Returns
2785
+ *
2786
+ * - `1` if valid
2787
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2788
+ *
2789
+ * # Safety
2790
+ *
2791
+ * * `level` must be a valid pointer to a null-terminated UTF-8 string
2792
+ * * `level` cannot be NULL
2793
+ * * The string must be valid for the duration of the call
2794
+ *
2795
+ * # C Signature
2796
+ *
2797
+ * ```c
2798
+ * int32_t kreuzberg_validate_token_reduction_level(const char* level);
2799
+ * ```
2800
+ */
2801
+ int32_t kreuzberg_validate_token_reduction_level(const char *level);
2802
+
2803
+ /**
2804
+ * Validates a tesseract Page Segmentation Mode (PSM) value.
2805
+ *
2806
+ * # Arguments
2807
+ *
2808
+ * * `psm` - PSM value (valid range: 0-13)
2809
+ *
2810
+ * # Returns
2811
+ *
2812
+ * - `1` if valid
2813
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2814
+ *
2815
+ * # C Signature
2816
+ *
2817
+ * ```c
2818
+ * int32_t kreuzberg_validate_tesseract_psm(int32_t psm);
2819
+ * ```
2820
+ */
2821
+ int32_t kreuzberg_validate_tesseract_psm(int32_t psm);
2822
+
2823
+ /**
2824
+ * Validates a tesseract OCR Engine Mode (OEM) value.
2825
+ *
2826
+ * # Arguments
2827
+ *
2828
+ * * `oem` - OEM value (valid range: 0-3)
2829
+ *
2830
+ * # Returns
2831
+ *
2832
+ * - `1` if valid
2833
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2834
+ *
2835
+ * # C Signature
2836
+ *
2837
+ * ```c
2838
+ * int32_t kreuzberg_validate_tesseract_oem(int32_t oem);
2839
+ * ```
2840
+ */
2841
+ int32_t kreuzberg_validate_tesseract_oem(int32_t oem);
2842
+
2843
+ /**
2844
+ * Validates a tesseract output format string.
2845
+ *
2846
+ * # Arguments
2847
+ *
2848
+ * * `format` - C string containing the output format (e.g., "text", "markdown")
2849
+ *
2850
+ * # Returns
2851
+ *
2852
+ * - `1` if valid
2853
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2854
+ *
2855
+ * # Safety
2856
+ *
2857
+ * * `format` must be a valid pointer to a null-terminated UTF-8 string
2858
+ * * `format` cannot be NULL
2859
+ * * The string must be valid for the duration of the call
2860
+ *
2861
+ * # C Signature
2862
+ *
2863
+ * ```c
2864
+ * int32_t kreuzberg_validate_output_format(const char* format);
2865
+ * ```
2866
+ */
2867
+ int32_t kreuzberg_validate_output_format(const char *format);
2868
+
2869
+ /**
2870
+ * Validates a confidence threshold value.
2871
+ *
2872
+ * Confidence thresholds must be between 0.0 and 1.0 inclusive.
2873
+ *
2874
+ * # Arguments
2875
+ *
2876
+ * * `confidence` - Confidence threshold value
2877
+ *
2878
+ * # Returns
2879
+ *
2880
+ * - `1` if valid
2881
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2882
+ *
2883
+ * # C Signature
2884
+ *
2885
+ * ```c
2886
+ * int32_t kreuzberg_validate_confidence(double confidence);
2887
+ * ```
2888
+ */
2889
+ int32_t kreuzberg_validate_confidence(double confidence);
2890
+
2891
+ /**
2892
+ * Validates a DPI (dots per inch) value.
2893
+ *
2894
+ * DPI must be a positive integer, typically 72-600.
2895
+ *
2896
+ * # Arguments
2897
+ *
2898
+ * * `dpi` - DPI value
2899
+ *
2900
+ * # Returns
2901
+ *
2902
+ * - `1` if valid
2903
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2904
+ *
2905
+ * # C Signature
2906
+ *
2907
+ * ```c
2908
+ * int32_t kreuzberg_validate_dpi(int32_t dpi);
2909
+ * ```
2910
+ */
2911
+ int32_t kreuzberg_validate_dpi(int32_t dpi);
2912
+
2913
+ /**
2914
+ * Validates chunking parameters.
2915
+ *
2916
+ * Checks that `max_chars > 0` and `max_overlap < max_chars`.
2917
+ *
2918
+ * # Arguments
2919
+ *
2920
+ * * `max_chars` - Maximum characters per chunk
2921
+ * * `max_overlap` - Maximum overlap between chunks
2922
+ *
2923
+ * # Returns
2924
+ *
2925
+ * - `1` if valid
2926
+ * - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
2927
+ *
2928
+ * # C Signature
2929
+ *
2930
+ * ```c
2931
+ * int32_t kreuzberg_validate_chunking_params(uintptr_t max_chars, uintptr_t max_overlap);
2932
+ * ```
2933
+ */
2934
+ int32_t kreuzberg_validate_chunking_params(uintptr_t max_chars, uintptr_t max_overlap);
2935
+
2936
+ /**
2937
+ * Returns valid binarization methods as a JSON array string.
2938
+ *
2939
+ * The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
2940
+ *
2941
+ * # Returns
2942
+ *
2943
+ * A pointer to a dynamically allocated C string containing a JSON array of valid methods.
2944
+ * Returns NULL if memory allocation fails (error message available via `kreuzberg_get_last_error_message()`).
2945
+ *
2946
+ * # Example
2947
+ *
2948
+ * The returned JSON string looks like: `["otsu","adaptive","sauvola"]`
2949
+ *
2950
+ * # C Signature
2951
+ *
2952
+ * ```c
2953
+ * char* kreuzberg_get_valid_binarization_methods(void);
2954
+ * ```
2955
+ */
2956
+ char *kreuzberg_get_valid_binarization_methods(void);
2957
+
2958
+ /**
2959
+ * Returns valid language codes as a JSON array string.
2960
+ *
2961
+ * The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
2962
+ *
2963
+ * # Returns
2964
+ *
2965
+ * A pointer to a dynamically allocated C string containing a JSON array of valid codes.
2966
+ * Returns NULL if memory allocation fails (error message available via `kreuzberg_get_last_error_message()`).
2967
+ *
2968
+ * # C Signature
2969
+ *
2970
+ * ```c
2971
+ * char* kreuzberg_get_valid_language_codes(void);
2972
+ * ```
2973
+ */
2974
+ char *kreuzberg_get_valid_language_codes(void);
2975
+
2976
+ /**
2977
+ * Returns valid OCR backends as a JSON array string.
2978
+ *
2979
+ * The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
2980
+ *
2981
+ * # Returns
2982
+ *
2983
+ * A pointer to a dynamically allocated C string containing a JSON array of valid backends.
2984
+ * Returns NULL if memory allocation fails (error message available via `kreuzberg_get_last_error_message()`).
2985
+ *
2986
+ * # C Signature
2987
+ *
2988
+ * ```c
2989
+ * char* kreuzberg_get_valid_ocr_backends(void);
2990
+ * ```
2991
+ */
2992
+ char *kreuzberg_get_valid_ocr_backends(void);
2993
+
2994
+ /**
2995
+ * Returns valid token reduction levels as a JSON array string.
2996
+ *
2997
+ * The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
2998
+ *
2999
+ * # Returns
3000
+ *
3001
+ * A pointer to a dynamically allocated C string containing a JSON array of valid levels.
3002
+ * Returns NULL if memory allocation fails (error message available via `kreuzberg_get_last_error_message()`).
3003
+ *
3004
+ * # C Signature
3005
+ *
3006
+ * ```c
3007
+ * char* kreuzberg_get_valid_token_reduction_levels(void);
3008
+ * ```
3009
+ */
3010
+ char *kreuzberg_get_valid_token_reduction_levels(void);
3011
+
3012
+ #endif /* KREUZBERG_FFI_H */