kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446)
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,1341 @@
1
+ //! Centralized FFI configuration parsing module.
2
+ //!
3
+ //! This module consolidates all configuration parsing logic that was previously
4
+ //! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
5
+ //!
6
+ //! Instead of each binding reimplementing config parsing from JSON, they now
7
+ //! call the FFI functions provided here, ensuring:
8
+ //! - Single source of truth for validation rules
9
+ //! - Consistent behavior across all languages
10
+ //! - Elimination of drift/inconsistencies
11
+ //! - Better performance (no JSON round-trips in language bindings)
12
+
13
+ use crate::ffi_panic_guard;
14
+ use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
15
+ use kreuzberg::KreuzbergError;
16
+ use kreuzberg::core::config::ExtractionConfig;
17
+ use serde::Serialize;
18
+ use std::ffi::{CStr, CString};
19
+ use std::os::raw::c_char;
20
+ use std::path::Path;
21
+ use std::ptr;
22
+
23
+ type FfiResult<T> = std::result::Result<T, String>;
24
+
25
+ /// Parse an ExtractionConfig from a JSON string.
26
+ ///
27
+ /// This is the primary FFI entry point for all language bindings to parse
28
+ /// configuration from JSON. Replaces the need for each binding to implement
29
+ /// its own JSON parsing logic.
30
+ ///
31
+ /// # Arguments
32
+ ///
33
+ /// * `json_config` - Null-terminated C string containing JSON configuration
34
+ ///
35
+ /// # Returns
36
+ ///
37
+ /// A pointer to an ExtractionConfig struct that MUST be freed with
38
+ /// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
39
+ ///
40
+ /// # Safety
41
+ ///
42
+ /// - `json_config` must be a valid null-terminated C string
43
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
44
+ /// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
45
+ ///
46
+ /// # Example (C)
47
+ ///
48
+ /// ```c
49
+ /// const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
50
+ /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
51
+ /// if (config == NULL) {
52
+ /// printf("Error: %s\n", kreuzberg_last_error());
53
+ /// return 1;
54
+ /// }
55
+ ///
56
+ /// // Use config...
57
+ /// // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
58
+ ///
59
+ /// kreuzberg_config_free(config);
60
+ /// ```
61
+ #[unsafe(no_mangle)]
62
+ pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
63
+ if json_config.is_null() {
64
+ set_last_error("Config JSON cannot be NULL".to_string());
65
+ return ptr::null_mut();
66
+ }
67
+
68
+ clear_last_error();
69
+
70
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
71
+ Ok(s) => s,
72
+ Err(e) => {
73
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
74
+ return ptr::null_mut();
75
+ }
76
+ };
77
+
78
+ match parse_extraction_config_from_json(json_str) {
79
+ Ok(config) => Box::into_raw(Box::new(config)),
80
+ Err(e) => {
81
+ set_last_error(e);
82
+ ptr::null_mut()
83
+ }
84
+ }
85
+ }
86
+
87
+ /// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
88
+ ///
89
+ /// # Safety
90
+ ///
91
+ /// - `config` must be a pointer previously returned by a config creation function
92
+ /// - `config` can be NULL (no-op)
93
+ /// - `config` must not be used after this call
94
+ ///
95
+ /// # Example (C)
96
+ ///
97
+ /// ```c
98
+ /// ExtractionConfig* config = kreuzberg_config_from_json("{...}");
99
+ /// if (config != NULL) {
100
+ /// // Use config...
101
+ /// kreuzberg_config_free(config);
102
+ /// }
103
+ /// ```
104
+ #[unsafe(no_mangle)]
105
+ pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
106
+ if !config.is_null() {
107
+ let _ = unsafe { Box::from_raw(config) };
108
+ }
109
+ }
110
+
111
+ /// Validate a JSON config string without parsing it.
112
+ ///
113
+ /// This function checks if a JSON config string is valid and would parse correctly,
114
+ /// without allocating the full ExtractionConfig structure. Useful for validation
115
+ /// before committing to parsing.
116
+ ///
117
+ /// # Arguments
118
+ ///
119
+ /// * `json_config` - Null-terminated C string containing JSON configuration
120
+ ///
121
+ /// # Returns
122
+ ///
123
+ /// - 1 if valid (would parse successfully)
124
+ /// - 0 if invalid (check `kreuzberg_last_error` for details)
125
+ ///
126
+ /// # Safety
127
+ ///
128
+ /// - `json_config` must be a valid null-terminated C string
129
+ ///
130
+ /// # Example (C)
131
+ ///
132
+ /// ```c
133
+ /// const char* config_json = "{\"use_cache\": true}";
134
+ /// if (kreuzberg_config_is_valid(config_json)) {
135
+ /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
136
+ /// // Use config...
137
+ /// kreuzberg_config_free(config);
138
+ /// } else {
139
+ /// printf("Invalid config: %s\n", kreuzberg_last_error());
140
+ /// }
141
+ /// ```
142
+ #[unsafe(no_mangle)]
143
+ pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
144
+ if json_config.is_null() {
145
+ set_last_error("Config JSON cannot be NULL".to_string());
146
+ return 0;
147
+ }
148
+
149
+ clear_last_error();
150
+
151
+ let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
152
+ Ok(s) => s,
153
+ Err(e) => {
154
+ set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
155
+ return 0;
156
+ }
157
+ };
158
+
159
+ match parse_extraction_config_from_json(json_str) {
160
+ Ok(_) => 1,
161
+ Err(e) => {
162
+ set_last_error(e);
163
+ 0
164
+ }
165
+ }
166
+ }
167
+
168
+ /// Serialize an ExtractionConfig to JSON string.
169
+ ///
170
+ /// Converts an ExtractionConfig structure to its JSON representation, allowing
171
+ /// bindings to serialize configs without reimplementing serialization logic.
172
+ ///
173
+ /// # Arguments
174
+ ///
175
+ /// * `config` - Pointer to an ExtractionConfig structure
176
+ ///
177
+ /// # Returns
178
+ ///
179
+ /// A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
180
+ /// Returns NULL on error (check `kreuzberg_last_error`).
181
+ ///
182
+ /// # Safety
183
+ ///
184
+ /// - `config` must be a valid pointer to an ExtractionConfig
185
+ /// - `config` cannot be NULL
186
+ /// - The returned pointer must be freed with `kreuzberg_free_string`
187
+ ///
188
+ /// # Example (C)
189
+ ///
190
+ /// ```c
191
+ /// ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
192
+ /// if (config != NULL) {
193
+ /// char* json = kreuzberg_config_to_json(config);
194
+ /// if (json != NULL) {
195
+ /// printf("Serialized: %s\n", json);
196
+ /// kreuzberg_free_string(json);
197
+ /// }
198
+ /// kreuzberg_config_free(config);
199
+ /// }
200
+ /// ```
201
+ #[unsafe(no_mangle)]
202
+ pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
203
+ if config.is_null() {
204
+ set_last_error("Config cannot be NULL".to_string());
205
+ return ptr::null_mut();
206
+ }
207
+
208
+ clear_last_error();
209
+
210
+ match serde_json::to_string(unsafe { &*config }) {
211
+ Ok(json) => match std::ffi::CString::new(json) {
212
+ Ok(c_string) => c_string.into_raw(),
213
+ Err(e) => {
214
+ set_last_error(format!("Failed to convert JSON to C string: {}", e));
215
+ ptr::null_mut()
216
+ }
217
+ },
218
+ Err(e) => {
219
+ set_last_error(format!("Failed to serialize config to JSON: {}", e));
220
+ ptr::null_mut()
221
+ }
222
+ }
223
+ }
224
+
225
+ /// Get a specific field from config as JSON string.
226
+ ///
227
+ /// Retrieves a nested field from the configuration by path and returns its JSON
228
+ /// representation. Supports dot notation for nested fields (e.g., "ocr.backend").
229
+ ///
230
+ /// # Arguments
231
+ ///
232
+ /// * `config` - Pointer to an ExtractionConfig structure
233
+ /// * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
234
+ ///
235
+ /// # Returns
236
+ ///
237
+ /// A pointer to a C string containing the field value as JSON, or NULL if:
238
+ /// - The field doesn't exist
239
+ /// - An error occurs during serialization
240
+ ///
241
+ /// The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
242
+ ///
243
+ /// # Safety
244
+ ///
245
+ /// - `config` must be a valid pointer to an ExtractionConfig
246
+ /// - `field_name` must be a valid null-terminated C string
247
+ /// - Neither parameter can be NULL
248
+ ///
249
+ /// # Example (C)
250
+ ///
251
+ /// ```c
252
+ /// ExtractionConfig* config = kreuzberg_config_from_json(
253
+ /// "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
254
+ /// );
255
+ /// if (config != NULL) {
256
+ /// char* use_cache = kreuzberg_config_get_field(config, "use_cache");
257
+ /// char* backend = kreuzberg_config_get_field(config, "ocr.backend");
258
+ ///
259
+ /// if (use_cache != NULL) {
260
+ /// printf("use_cache: %s\n", use_cache);
261
+ /// kreuzberg_free_string(use_cache);
262
+ /// }
263
+ ///
264
+ /// if (backend != NULL) {
265
+ /// printf("backend: %s\n", backend);
266
+ /// kreuzberg_free_string(backend);
267
+ /// }
268
+ ///
269
+ /// kreuzberg_config_free(config);
270
+ /// }
271
+ /// ```
272
+ #[unsafe(no_mangle)]
273
+ pub unsafe extern "C" fn kreuzberg_config_get_field(
274
+ config: *const ExtractionConfig,
275
+ field_name: *const c_char,
276
+ ) -> *mut c_char {
277
+ if config.is_null() {
278
+ set_last_error("Config cannot be NULL".to_string());
279
+ return ptr::null_mut();
280
+ }
281
+
282
+ if field_name.is_null() {
283
+ set_last_error("Field name cannot be NULL".to_string());
284
+ return ptr::null_mut();
285
+ }
286
+
287
+ clear_last_error();
288
+
289
+ let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
290
+ Ok(s) => s,
291
+ Err(e) => {
292
+ set_last_error(format!("Invalid UTF-8 in field name: {}", e));
293
+ return ptr::null_mut();
294
+ }
295
+ };
296
+
297
+ let json_value = match serde_json::to_value(unsafe { &*config }) {
298
+ Ok(val) => val,
299
+ Err(e) => {
300
+ set_last_error(format!("Failed to serialize config: {}", e));
301
+ return ptr::null_mut();
302
+ }
303
+ };
304
+
305
+ let mut current = &json_value;
306
+ for part in field_str.split('.') {
307
+ if let Some(obj) = current.as_object() {
308
+ match obj.get(part) {
309
+ Some(val) => current = val,
310
+ None => {
311
+ set_last_error(format!("Field '{}' not found in config", field_str));
312
+ return ptr::null_mut();
313
+ }
314
+ }
315
+ } else {
316
+ set_last_error(format!("Cannot access nested field '{}' in non-object", part));
317
+ return ptr::null_mut();
318
+ }
319
+ }
320
+
321
+ match serde_json::to_string(current) {
322
+ Ok(json) => match std::ffi::CString::new(json) {
323
+ Ok(c_string) => c_string.into_raw(),
324
+ Err(e) => {
325
+ set_last_error(format!("Failed to convert field value to C string: {}", e));
326
+ ptr::null_mut()
327
+ }
328
+ },
329
+ Err(e) => {
330
+ set_last_error(format!("Failed to serialize field value: {}", e));
331
+ ptr::null_mut()
332
+ }
333
+ }
334
+ }
335
+
336
+ /// Merge two configs (override takes precedence over base).
337
+ ///
338
+ /// Performs a shallow merge of two ExtractionConfig structures, where fields
339
+ /// from `override_config` take precedence over fields in `base`. The `base`
340
+ /// config is modified in-place.
341
+ ///
342
+ /// # Arguments
343
+ ///
344
+ /// * `base` - Pointer to the base ExtractionConfig (will be modified)
345
+ /// * `override_config` - Pointer to the override ExtractionConfig (read-only)
346
+ ///
347
+ /// # Returns
348
+ ///
349
+ /// - 1 on success
350
+ /// - 0 on error (check `kreuzberg_last_error`)
351
+ ///
352
+ /// # Safety
353
+ ///
354
+ /// - `base` must be a valid mutable pointer to an ExtractionConfig
355
+ /// - `override_config` must be a valid pointer to an ExtractionConfig
356
+ /// - Neither parameter can be NULL
357
+ /// - `base` is modified in-place
358
+ ///
359
+ /// # Example (C)
360
+ ///
361
+ /// ```c
362
+ /// ExtractionConfig* base = kreuzberg_config_from_json(
363
+ /// "{\"use_cache\": true, \"force_ocr\": false}"
364
+ /// );
365
+ /// ExtractionConfig* override = kreuzberg_config_from_json(
366
+ /// "{\"force_ocr\": true}"
367
+ /// );
368
+ ///
369
+ /// if (kreuzberg_config_merge(base, override) == 1) {
370
+ /// // base now has: use_cache=true, force_ocr=true
371
+ /// char* json = kreuzberg_config_to_json(base);
372
+ /// printf("Merged config: %s\n", json);
373
+ /// kreuzberg_free_string(json);
374
+ /// }
375
+ ///
376
+ /// kreuzberg_config_free(base);
377
+ /// kreuzberg_config_free(override);
378
+ /// ```
379
+ #[unsafe(no_mangle)]
380
+ pub unsafe extern "C" fn kreuzberg_config_merge(
381
+ base: *mut ExtractionConfig,
382
+ override_config: *const ExtractionConfig,
383
+ ) -> i32 {
384
+ if base.is_null() {
385
+ set_last_error("Base config cannot be NULL".to_string());
386
+ return 0;
387
+ }
388
+
389
+ if override_config.is_null() {
390
+ set_last_error("Override config cannot be NULL".to_string());
391
+ return 0;
392
+ }
393
+
394
+ clear_last_error();
395
+
396
+ let base_ref = unsafe { &mut *base };
397
+ let override_ref = unsafe { &*override_config };
398
+
399
+ base_ref.use_cache = override_ref.use_cache;
400
+ base_ref.enable_quality_processing = override_ref.enable_quality_processing;
401
+ base_ref.force_ocr = override_ref.force_ocr;
402
+ base_ref.max_concurrent_extractions = override_ref.max_concurrent_extractions;
403
+
404
+ if override_ref.ocr.is_some() {
405
+ base_ref.ocr = override_ref.ocr.clone();
406
+ }
407
+
408
+ if override_ref.chunking.is_some() {
409
+ base_ref.chunking = override_ref.chunking.clone();
410
+ }
411
+
412
+ if override_ref.images.is_some() {
413
+ base_ref.images = override_ref.images.clone();
414
+ }
415
+
416
+ #[cfg(feature = "pdf")]
417
+ if override_ref.pdf_options.is_some() {
418
+ base_ref.pdf_options = override_ref.pdf_options.clone();
419
+ }
420
+
421
+ if override_ref.token_reduction.is_some() {
422
+ base_ref.token_reduction = override_ref.token_reduction.clone();
423
+ }
424
+
425
+ if override_ref.language_detection.is_some() {
426
+ base_ref.language_detection = override_ref.language_detection.clone();
427
+ }
428
+
429
+ if override_ref.pages.is_some() {
430
+ base_ref.pages = override_ref.pages.clone();
431
+ }
432
+
433
+ #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
434
+ if override_ref.keywords.is_some() {
435
+ base_ref.keywords = override_ref.keywords.clone();
436
+ }
437
+
438
+ if override_ref.postprocessor.is_some() {
439
+ base_ref.postprocessor = override_ref.postprocessor.clone();
440
+ }
441
+
442
+ if override_ref.html_options.is_some() {
443
+ base_ref.html_options = override_ref.html_options.clone();
444
+ }
445
+
446
+ 1
447
+ }
448
+
449
+ /// Parse ExtractionConfig from JSON string.
450
+ ///
451
+ /// This is the core parsing logic shared by all FFI functions that deal with
452
+ /// JSON configuration. It handles:
453
+ /// - JSON deserialization
454
+ /// - All validation rules
455
+ /// - Type conversions
456
+ /// - HTML options parsing (complex nested structure)
457
+ ///
458
+ /// The error messages are user-friendly and include guidance on what went wrong.
459
+ fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
460
+ use html_to_markdown_rs::options::{
461
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
462
+ PreprocessingPreset, WhitespaceMode,
463
+ };
464
+
465
+ // ~keep: This function performs the JSON parsing and validation that was
466
+
467
+ fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
468
+ where
469
+ F: Fn(&str) -> FfiResult<T>,
470
+ {
471
+ if let Some(raw) = value {
472
+ let text = raw
473
+ .as_str()
474
+ .ok_or_else(|| "Expected string for enum field".to_string())?;
475
+ return parse_fn(text).map(Some);
476
+ }
477
+ Ok(None)
478
+ }
479
+
480
+ fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
481
+ match value.to_lowercase().as_str() {
482
+ "atx" => Ok(HeadingStyle::Atx),
483
+ "underlined" => Ok(HeadingStyle::Underlined),
484
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
485
+ other => Err(format!(
486
+ "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
487
+ other
488
+ )),
489
+ }
490
+ }
491
+
492
+ fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
493
+ match value.to_lowercase().as_str() {
494
+ "spaces" => Ok(ListIndentType::Spaces),
495
+ "tabs" => Ok(ListIndentType::Tabs),
496
+ other => Err(format!(
497
+ "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
498
+ other
499
+ )),
500
+ }
501
+ }
502
+
503
+ fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
504
+ match value.to_lowercase().as_str() {
505
+ "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
506
+ "html" => Ok(HighlightStyle::Html),
507
+ "bold" => Ok(HighlightStyle::Bold),
508
+ "none" => Ok(HighlightStyle::None),
509
+ other => Err(format!(
510
+ "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
511
+ other
512
+ )),
513
+ }
514
+ }
515
+
516
+ fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
517
+ match value.to_lowercase().as_str() {
518
+ "normalized" => Ok(WhitespaceMode::Normalized),
519
+ "strict" => Ok(WhitespaceMode::Strict),
520
+ other => Err(format!(
521
+ "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
522
+ other
523
+ )),
524
+ }
525
+ }
526
+
527
+ fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
528
+ match value.to_lowercase().as_str() {
529
+ "spaces" => Ok(NewlineStyle::Spaces),
530
+ "backslash" => Ok(NewlineStyle::Backslash),
531
+ other => Err(format!(
532
+ "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
533
+ other
534
+ )),
535
+ }
536
+ }
537
+
538
+ fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
539
+ match value.to_lowercase().as_str() {
540
+ "indented" => Ok(CodeBlockStyle::Indented),
541
+ "backticks" => Ok(CodeBlockStyle::Backticks),
542
+ "tildes" => Ok(CodeBlockStyle::Tildes),
543
+ other => Err(format!(
544
+ "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
545
+ other
546
+ )),
547
+ }
548
+ }
549
+
550
+ #[allow(dead_code)]
551
+ fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
552
+ match value.to_lowercase().as_str() {
553
+ "minimal" => Ok(PreprocessingPreset::Minimal),
554
+ "standard" => Ok(PreprocessingPreset::Standard),
555
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
556
+ other => Err(format!(
557
+ "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
558
+ other
559
+ )),
560
+ }
561
+ }
562
+
563
+ fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
564
+ let mut opts = ConversionOptions::default();
565
+ let obj = value
566
+ .as_object()
567
+ .ok_or_else(|| "html_options must be an object".to_string())?;
568
+
569
+ if let Some(val) = obj.get("heading_style") {
570
+ opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
571
+ }
572
+
573
+ if let Some(val) = obj.get("list_indent_type") {
574
+ opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
575
+ }
576
+
577
+ if let Some(val) = obj.get("list_indent_width") {
578
+ opts.list_indent_width = val
579
+ .as_u64()
580
+ .map(|v| v as usize)
581
+ .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
582
+ }
583
+
584
+ if let Some(val) = obj.get("bullets") {
585
+ opts.bullets = val
586
+ .as_str()
587
+ .map(str::to_string)
588
+ .ok_or_else(|| "bullets must be a string".to_string())?;
589
+ }
590
+
591
+ if let Some(val) = obj.get("strong_em_symbol") {
592
+ let symbol = val
593
+ .as_str()
594
+ .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
595
+ let mut chars = symbol.chars();
596
+ opts.strong_em_symbol = chars
597
+ .next()
598
+ .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
599
+ }
600
+
601
+ if let Some(val) = obj.get("escape_asterisks") {
602
+ opts.escape_asterisks = val
603
+ .as_bool()
604
+ .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
605
+ }
606
+
607
+ if let Some(val) = obj.get("escape_underscores") {
608
+ opts.escape_underscores = val
609
+ .as_bool()
610
+ .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
611
+ }
612
+
613
+ if let Some(val) = obj.get("escape_misc") {
614
+ opts.escape_misc = val
615
+ .as_bool()
616
+ .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
617
+ }
618
+
619
+ if let Some(val) = obj.get("escape_ascii") {
620
+ opts.escape_ascii = val
621
+ .as_bool()
622
+ .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
623
+ }
624
+
625
+ if let Some(val) = obj.get("code_language") {
626
+ opts.code_language = val
627
+ .as_str()
628
+ .map(str::to_string)
629
+ .ok_or_else(|| "code_language must be a string".to_string())?;
630
+ }
631
+
632
+ if let Some(val) = obj.get("autolinks") {
633
+ opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
634
+ }
635
+
636
+ if let Some(val) = obj.get("default_title") {
637
+ opts.default_title = val
638
+ .as_bool()
639
+ .ok_or_else(|| "default_title must be a boolean".to_string())?;
640
+ }
641
+
642
+ if let Some(val) = obj.get("br_in_tables") {
643
+ opts.br_in_tables = val
644
+ .as_bool()
645
+ .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
646
+ }
647
+
648
+ if let Some(val) = obj.get("hocr_spatial_tables") {
649
+ opts.hocr_spatial_tables = val
650
+ .as_bool()
651
+ .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
652
+ }
653
+
654
+ if let Some(val) = obj.get("highlight_style") {
655
+ opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
656
+ }
657
+
658
+ if let Some(val) = obj.get("extract_metadata") {
659
+ opts.extract_metadata = val
660
+ .as_bool()
661
+ .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
662
+ }
663
+
664
+ if let Some(val) = obj.get("whitespace_mode") {
665
+ opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
666
+ }
667
+
668
+ if let Some(val) = obj.get("strip_newlines") {
669
+ opts.strip_newlines = val
670
+ .as_bool()
671
+ .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
672
+ }
673
+
674
+ if let Some(val) = obj.get("wrap") {
675
+ opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
676
+ }
677
+
678
+ if let Some(val) = obj.get("wrap_width") {
679
+ opts.wrap_width = val
680
+ .as_u64()
681
+ .map(|v| v as usize)
682
+ .ok_or_else(|| "wrap_width must be an integer".to_string())?;
683
+ }
684
+
685
+ if let Some(val) = obj.get("convert_as_inline") {
686
+ opts.convert_as_inline = val
687
+ .as_bool()
688
+ .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
689
+ }
690
+
691
+ if let Some(val) = obj.get("sub_symbol") {
692
+ opts.sub_symbol = val
693
+ .as_str()
694
+ .map(str::to_string)
695
+ .ok_or_else(|| "sub_symbol must be a string".to_string())?;
696
+ }
697
+
698
+ if let Some(val) = obj.get("sup_symbol") {
699
+ opts.sup_symbol = val
700
+ .as_str()
701
+ .map(str::to_string)
702
+ .ok_or_else(|| "sup_symbol must be a string".to_string())?;
703
+ }
704
+
705
+ if let Some(val) = obj.get("newline_style") {
706
+ opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
707
+ }
708
+
709
+ if let Some(val) = obj.get("code_block_style") {
710
+ opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
711
+ }
712
+
713
+ if let Some(val) = obj.get("keep_inline_images_in") {
714
+ opts.keep_inline_images_in = val
715
+ .as_array()
716
+ .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
717
+ .iter()
718
+ .map(|v| {
719
+ v.as_str()
720
+ .map(str::to_string)
721
+ .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
722
+ })
723
+ .collect::<FfiResult<Vec<_>>>()?;
724
+ }
725
+
726
+ if let Some(val) = obj.get("encoding") {
727
+ opts.encoding = val
728
+ .as_str()
729
+ .map(str::to_string)
730
+ .ok_or_else(|| "encoding must be a string".to_string())?;
731
+ }
732
+
733
+ if let Some(val) = obj.get("debug") {
734
+ opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
735
+ }
736
+
737
+ if let Some(val) = obj.get("strip_tags") {
738
+ opts.strip_tags = val
739
+ .as_array()
740
+ .ok_or_else(|| "strip_tags must be an array".to_string())?
741
+ .iter()
742
+ .map(|v| {
743
+ v.as_str()
744
+ .map(str::to_string)
745
+ .ok_or_else(|| "strip_tags entries must be strings".to_string())
746
+ })
747
+ .collect::<FfiResult<Vec<_>>>()?;
748
+ }
749
+
750
+ if let Some(val) = obj.get("preserve_tags") {
751
+ opts.preserve_tags = val
752
+ .as_array()
753
+ .ok_or_else(|| "preserve_tags must be an array".to_string())?
754
+ .iter()
755
+ .map(|v| {
756
+ v.as_str()
757
+ .map(str::to_string)
758
+ .ok_or_else(|| "preserve_tags entries must be strings".to_string())
759
+ })
760
+ .collect::<FfiResult<Vec<_>>>()?;
761
+ }
762
+
763
+ Ok(opts)
764
+ }
765
+
766
+ let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
767
+
768
+ let mut config: ExtractionConfig =
769
+ serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
770
+
771
+ if let Some(html_opts_val) = json_value.get("html_options") {
772
+ config.html_options = Some(parse_html_options(html_opts_val)?);
773
+ }
774
+
775
+ Ok(config)
776
+ }
777
+
778
+ /// SerializableEmbeddingPreset for FFI serialization.
779
+ #[derive(Serialize)]
780
+ struct SerializableEmbeddingPreset<'a> {
781
+ name: &'a str,
782
+ chunk_size: usize,
783
+ overlap: usize,
784
+ model_name: String,
785
+ dimensions: usize,
786
+ description: &'a str,
787
+ }
788
+
789
+ /// Load an ExtractionConfig from a file.
790
+ ///
791
+ /// Returns a JSON string representing the loaded configuration.
792
+ ///
793
+ /// # Safety
794
+ ///
795
+ /// - `file_path` must be a valid null-terminated C string
796
+ /// - The returned string must be freed with `kreuzberg_free_string`
797
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
798
+ #[unsafe(no_mangle)]
799
+ pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
800
+ ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
801
+ clear_last_error();
802
+
803
+ if file_path.is_null() {
804
+ set_last_error("file_path cannot be NULL".to_string());
805
+ return ptr::null_mut();
806
+ }
807
+
808
+ let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
809
+ Ok(s) => s,
810
+ Err(e) => {
811
+ set_last_error(format!("Invalid UTF-8 in file path: {}", e));
812
+ return ptr::null_mut();
813
+ }
814
+ };
815
+
816
+ match ExtractionConfig::from_file(path_str) {
817
+ Ok(config) => match serde_json::to_string(&config) {
818
+ Ok(json) => match CString::new(json) {
819
+ Ok(cstr) => cstr.into_raw(),
820
+ Err(e) => {
821
+ set_last_error(format!("Failed to create C string: {}", e));
822
+ ptr::null_mut()
823
+ }
824
+ },
825
+ Err(e) => {
826
+ set_last_error(format!("Failed to serialize config to JSON: {}", e));
827
+ ptr::null_mut()
828
+ }
829
+ },
830
+ Err(e) => {
831
+ set_last_error(e.to_string());
832
+ ptr::null_mut()
833
+ }
834
+ }
835
+ })
836
+ }
837
+
838
+ /// Load an ExtractionConfig from a file (returns pointer to config struct).
839
+ ///
840
+ /// # Safety
841
+ ///
842
+ /// - `path` must be a valid null-terminated C string
843
+ /// - The returned pointer must be freed with `kreuzberg_config_free`
844
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
845
+ ///
846
+ /// # Example (C)
847
+ ///
848
+ /// ```c
849
+ /// ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
850
+ /// if (config == NULL) {
851
+ /// printf("Error: %s\n", kreuzberg_last_error());
852
+ /// return 1;
853
+ /// }
854
+ /// kreuzberg_config_free(config);
855
+ /// ```
856
+ #[unsafe(no_mangle)]
857
+ pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
858
+ ffi_panic_guard!("kreuzberg_config_from_file", {
859
+ clear_last_error();
860
+
861
+ if path.is_null() {
862
+ set_last_error("Config path cannot be NULL".to_string());
863
+ return ptr::null_mut();
864
+ }
865
+
866
+ let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
867
+ Ok(s) => s,
868
+ Err(e) => {
869
+ set_last_error(format!("Invalid UTF-8 in config path: {}", e));
870
+ return ptr::null_mut();
871
+ }
872
+ };
873
+
874
+ let path_buf = Path::new(path_str);
875
+
876
+ match ExtractionConfig::from_file(path_buf) {
877
+ Ok(config) => Box::into_raw(Box::new(config)),
878
+ Err(e) => {
879
+ match &e {
880
+ KreuzbergError::Io(io_err) => {
881
+ set_last_error(format!("IO error loading config: {}", io_err));
882
+ }
883
+ _ => {
884
+ set_last_error(format!("Failed to load config from file: {}", e));
885
+ }
886
+ }
887
+ ptr::null_mut()
888
+ }
889
+ }
890
+ })
891
+ }
892
+
893
+ /// Discover and load an ExtractionConfig by searching parent directories.
894
+ ///
895
+ /// Searches the current directory and all parent directories for:
896
+ /// - `kreuzberg.toml`
897
+ /// - `kreuzberg.json`
898
+ ///
899
+ /// Returns the first config file found as a JSON string.
900
+ ///
901
+ /// # Safety
902
+ ///
903
+ /// - The returned string must be freed with `kreuzberg_free_string`
904
+ /// - Returns NULL if no config is found or on error
905
+ ///
906
+ /// # Example (C)
907
+ ///
908
+ /// ```c
909
+ /// char* config_json = kreuzberg_config_discover();
910
+ /// if (config_json != NULL) {
911
+ /// printf("Discovered config: %s\n", config_json);
912
+ /// kreuzberg_free_string(config_json);
913
+ /// }
914
+ /// ```
915
+ #[unsafe(no_mangle)]
916
+ pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
917
+ ffi_panic_guard!("kreuzberg_config_discover", {
918
+ clear_last_error();
919
+
920
+ match ExtractionConfig::discover() {
921
+ Ok(Some(config)) => match serde_json::to_string(&config) {
922
+ Ok(json) => match CString::new(json) {
923
+ Ok(cstr) => cstr.into_raw(),
924
+ Err(e) => {
925
+ set_last_error(format!("Failed to serialize config: {}", e));
926
+ ptr::null_mut()
927
+ }
928
+ },
929
+ Err(e) => {
930
+ set_last_error(format!("Failed to serialize config: {}", e));
931
+ ptr::null_mut()
932
+ }
933
+ },
934
+ Ok(None) => ptr::null_mut(),
935
+ Err(e) => {
936
+ match &e {
937
+ KreuzbergError::Io(io_err) => {
938
+ set_last_error(format!("IO error discovering config: {}", io_err));
939
+ }
940
+ _ => {
941
+ set_last_error(format!("Failed to discover config: {}", e));
942
+ }
943
+ }
944
+ ptr::null_mut()
945
+ }
946
+ }
947
+ })
948
+ }
949
+
950
+ /// List available embedding preset names.
951
+ ///
952
+ /// # Safety
953
+ ///
954
+ /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
955
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
956
+ #[unsafe(no_mangle)]
957
+ pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
958
+ ffi_panic_guard!("kreuzberg_list_embedding_presets", {
959
+ clear_last_error();
960
+
961
+ let presets = kreuzberg::embeddings::list_presets();
962
+ match serde_json::to_string(&presets) {
963
+ Ok(json) => match string_to_c_string(json) {
964
+ Ok(ptr) => ptr,
965
+ Err(e) => {
966
+ set_last_error(e);
967
+ ptr::null_mut()
968
+ }
969
+ },
970
+ Err(e) => {
971
+ set_last_error(format!("Failed to serialize presets: {}", e));
972
+ ptr::null_mut()
973
+ }
974
+ }
975
+ })
976
+ }
977
+
978
+ /// Get a specific embedding preset by name.
979
+ ///
980
+ /// # Safety
981
+ ///
982
+ /// - `name` must be a valid null-terminated C string
983
+ /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
984
+ /// - Returns NULL on error (check `kreuzberg_last_error`)
985
+ #[unsafe(no_mangle)]
986
+ pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
987
+ ffi_panic_guard!("kreuzberg_get_embedding_preset", {
988
+ clear_last_error();
989
+
990
+ if name.is_null() {
991
+ set_last_error("preset name cannot be NULL".to_string());
992
+ return ptr::null_mut();
993
+ }
994
+
995
+ let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
996
+ Ok(s) => s,
997
+ Err(e) => {
998
+ set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
999
+ return ptr::null_mut();
1000
+ }
1001
+ };
1002
+
1003
+ let preset = match kreuzberg::embeddings::get_preset(preset_name) {
1004
+ Some(preset) => preset,
1005
+ None => {
1006
+ set_last_error(format!("Unknown embedding preset: {}", preset_name));
1007
+ return ptr::null_mut();
1008
+ }
1009
+ };
1010
+
1011
+ let model_name = format!("{:?}", preset.model);
1012
+ let serializable = SerializableEmbeddingPreset {
1013
+ name: preset.name,
1014
+ chunk_size: preset.chunk_size,
1015
+ overlap: preset.overlap,
1016
+ model_name,
1017
+ dimensions: preset.dimensions,
1018
+ description: preset.description,
1019
+ };
1020
+
1021
+ match serde_json::to_string(&serializable) {
1022
+ Ok(json) => match string_to_c_string(json) {
1023
+ Ok(ptr) => ptr,
1024
+ Err(e) => {
1025
+ set_last_error(e);
1026
+ ptr::null_mut()
1027
+ }
1028
+ },
1029
+ Err(e) => {
1030
+ set_last_error(format!("Failed to serialize embedding preset: {}", e));
1031
+ ptr::null_mut()
1032
+ }
1033
+ }
1034
+ })
1035
+ }
1036
+
1037
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CStr;

    // ---- JSON parsing -------------------------------------------------------

    // An empty JSON object parses successfully.
    #[test]
    fn test_parse_minimal_config() {
        let parsed = parse_extraction_config_from_json("{}");
        assert!(parsed.is_ok());
    }

    // `use_cache` set in the JSON is reflected in the parsed config.
    #[test]
    fn test_parse_config_with_use_cache() {
        let parsed = parse_extraction_config_from_json(r#"{"use_cache": true}"#);
        assert!(parsed.is_ok());
        assert!(parsed.unwrap().use_cache);
    }

    // A nested `ocr` object populates the backend and language fields.
    #[test]
    fn test_parse_config_with_ocr() {
        let source = r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#;
        let parsed = parse_extraction_config_from_json(source);
        assert!(parsed.is_ok());

        let config = parsed.unwrap();
        assert!(config.ocr.is_some());

        let ocr = config.ocr.unwrap();
        assert_eq!(ocr.backend, "tesseract");
        assert_eq!(ocr.language, "eng");
    }

    // Malformed JSON is rejected.
    #[test]
    fn test_parse_invalid_json() {
        let parsed = parse_extraction_config_from_json("{invalid json}");
        assert!(parsed.is_err());
    }

    // A config combining several sections parses successfully.
    #[test]
    fn test_parse_complex_config() {
        let source = r#"{
            "use_cache": true,
            "enable_quality_processing": true,
            "force_ocr": false,
            "ocr": {
                "backend": "tesseract",
                "language": "eng"
            },
            "chunking": {
                "max_chars": 1024,
                "max_overlap": 128
            },
            "max_concurrent_extractions": 4
        }"#;
        let parsed = parse_extraction_config_from_json(source);
        assert!(parsed.is_ok());
    }

    // ---- kreuzberg_config_to_json -------------------------------------------

    // A config created from JSON serializes back to JSON containing the field.
    #[test]
    fn test_config_to_json() {
        let input = CString::new(r#"{"use_cache": true}"#).unwrap();
        let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
        assert!(!config_ptr.is_null());

        let json_out = unsafe { kreuzberg_config_to_json(config_ptr) };
        assert!(!json_out.is_null());

        let rendered = unsafe { CStr::from_ptr(json_out).to_str().unwrap() };
        assert!(rendered.contains("use_cache"));
        assert!(rendered.contains("true"));

        unsafe {
            crate::kreuzberg_free_string(json_out);
            kreuzberg_config_free(config_ptr);
        }
    }

    // A NULL config pointer yields a NULL result.
    #[test]
    fn test_config_to_json_null_pointer() {
        let json_out = unsafe { kreuzberg_config_to_json(ptr::null()) };
        assert!(json_out.is_null());
    }

    // ---- kreuzberg_config_get_field -----------------------------------------

    // A top-level boolean field is returned as its JSON text.
    #[test]
    fn test_config_get_field_simple() {
        let input = CString::new(r#"{"use_cache": true}"#).unwrap();
        let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
        assert!(!config_ptr.is_null());

        let field_name = CString::new("use_cache").unwrap();
        let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
        assert!(!field_value.is_null());

        let rendered = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
        assert_eq!(rendered, "true");

        unsafe {
            crate::kreuzberg_free_string(field_value);
            kreuzberg_config_free(config_ptr);
        }
    }

    // A dotted path reaches a nested string field, returned with JSON quotes.
    #[test]
    fn test_config_get_field_nested() {
        let input = CString::new(r#"{"ocr": {"backend": "tesseract"}}"#).unwrap();
        let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
        assert!(!config_ptr.is_null());

        let field_name = CString::new("ocr.backend").unwrap();
        let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
        assert!(!field_value.is_null());

        let rendered = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
        assert_eq!(rendered, r#""tesseract""#);

        unsafe {
            crate::kreuzberg_free_string(field_value);
            kreuzberg_config_free(config_ptr);
        }
    }

    // An unknown field name yields NULL.
    #[test]
    fn test_config_get_field_missing() {
        let input = CString::new(r#"{"use_cache": true}"#).unwrap();
        let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
        assert!(!config_ptr.is_null());

        let field_name = CString::new("nonexistent").unwrap();
        let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
        assert!(field_value.is_null());

        unsafe {
            kreuzberg_config_free(config_ptr);
        }
    }

    // A NULL config pointer yields NULL.
    #[test]
    fn test_config_get_field_null_pointer() {
        let field_name = CString::new("use_cache").unwrap();
        let field_value = unsafe { kreuzberg_config_get_field(ptr::null(), field_name.as_ptr()) };
        assert!(field_value.is_null());
    }

    // ---- kreuzberg_config_merge ---------------------------------------------

    // Merging two valid configs reports success and the result still serializes.
    #[test]
    fn test_config_merge() {
        let base_input = CString::new(r#"{"use_cache": true, "force_ocr": false}"#).unwrap();
        let override_input = CString::new(r#"{"force_ocr": true}"#).unwrap();

        let base_ptr = unsafe { kreuzberg_config_from_json(base_input.as_ptr()) };
        let override_ptr = unsafe { kreuzberg_config_from_json(override_input.as_ptr()) };

        assert!(!base_ptr.is_null());
        assert!(!override_ptr.is_null());

        let status = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
        assert_eq!(status, 1);

        let merged_json = unsafe { kreuzberg_config_to_json(base_ptr) };
        assert!(!merged_json.is_null());

        let rendered = unsafe { CStr::from_ptr(merged_json).to_str().unwrap() };
        assert!(rendered.contains("use_cache"));
        assert!(rendered.contains("force_ocr"));

        unsafe {
            crate::kreuzberg_free_string(merged_json);
            kreuzberg_config_free(base_ptr);
            kreuzberg_config_free(override_ptr);
        }
    }

    // A NULL base pointer makes the merge report failure (0).
    #[test]
    fn test_config_merge_null_base() {
        let override_input = CString::new(r#"{"force_ocr": true}"#).unwrap();
        let override_ptr = unsafe { kreuzberg_config_from_json(override_input.as_ptr()) };

        let status = unsafe { kreuzberg_config_merge(ptr::null_mut(), override_ptr) };
        assert_eq!(status, 0);

        unsafe {
            kreuzberg_config_free(override_ptr);
        }
    }

    // A NULL override pointer makes the merge report failure (0).
    #[test]
    fn test_config_merge_null_override() {
        let base_input = CString::new(r#"{"use_cache": true}"#).unwrap();
        let base_ptr = unsafe { kreuzberg_config_from_json(base_input.as_ptr()) };

        let status = unsafe { kreuzberg_config_merge(base_ptr, ptr::null()) };
        assert_eq!(status, 0);

        unsafe {
            kreuzberg_config_free(base_ptr);
        }
    }

    // An override of `use_cache: true` replaces a base value of `false`.
    #[test]
    fn test_config_merge_override_to_default_value() {
        let base_input = CString::new(r#"{"use_cache": false}"#).unwrap();
        let override_input = CString::new(r#"{"use_cache": true}"#).unwrap();

        let base_ptr = unsafe { kreuzberg_config_from_json(base_input.as_ptr()) };
        let override_ptr = unsafe { kreuzberg_config_from_json(override_input.as_ptr()) };

        assert!(!base_ptr.is_null());
        assert!(!override_ptr.is_null());

        let before = unsafe { &*base_ptr };
        assert!(!before.use_cache);

        let status = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
        assert_eq!(status, 1);

        let after = unsafe { &*base_ptr };
        assert!(after.use_cache, "override to default value should be applied");

        unsafe {
            kreuzberg_config_free(base_ptr);
            kreuzberg_config_free(override_ptr);
        }
    }

    // An override of `force_ocr: true` replaces a base value of `false`.
    #[test]
    fn test_config_merge_override_force_ocr() {
        let base_input = CString::new(r#"{"force_ocr": false}"#).unwrap();
        let override_input = CString::new(r#"{"force_ocr": true}"#).unwrap();

        let base_ptr = unsafe { kreuzberg_config_from_json(base_input.as_ptr()) };
        let override_ptr = unsafe { kreuzberg_config_from_json(override_input.as_ptr()) };

        assert!(!base_ptr.is_null());
        assert!(!override_ptr.is_null());

        let status = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
        assert_eq!(status, 1);

        let merged = unsafe { &*base_ptr };
        assert!(merged.force_ocr);

        unsafe {
            kreuzberg_config_free(base_ptr);
            kreuzberg_config_free(override_ptr);
        }
    }

    // ---- Embedding presets --------------------------------------------------

    // The preset listing is non-NULL and looks like a JSON array.
    #[test]
    fn test_list_embedding_presets() {
        let listing = unsafe { kreuzberg_list_embedding_presets() };
        assert!(!listing.is_null());

        let rendered = unsafe { CStr::from_ptr(listing).to_str().unwrap() };
        assert!(rendered.starts_with('['));
        assert!(rendered.ends_with(']'));

        unsafe {
            crate::kreuzberg_free_string(listing);
        }
    }

    // A NULL preset name yields NULL.
    #[test]
    fn test_get_embedding_preset_null() {
        let preset_json = unsafe { kreuzberg_get_embedding_preset(ptr::null()) };
        assert!(preset_json.is_null());
    }

    // An unknown preset name yields NULL.
    #[test]
    fn test_get_embedding_preset_unknown() {
        let preset_name = CString::new("nonexistent_preset").unwrap();
        let preset_json = unsafe { kreuzberg_get_embedding_preset(preset_name.as_ptr()) };
        assert!(preset_json.is_null());
    }

    // The "fast" preset serializes to JSON containing its name and chunk size keys.
    #[test]
    fn test_get_embedding_preset_valid() {
        let preset_name = CString::new("fast").unwrap();
        let preset_json = unsafe { kreuzberg_get_embedding_preset(preset_name.as_ptr()) };
        assert!(!preset_json.is_null());

        let rendered = unsafe { CStr::from_ptr(preset_json).to_str().unwrap() };
        assert!(rendered.contains("name"));
        assert!(rendered.contains("chunk_size"));

        unsafe {
            crate::kreuzberg_free_string(preset_json);
        }
    }

    // ---- Config discovery ---------------------------------------------------

    // Discovery may return NULL (no config found); any non-NULL result is freed.
    #[test]
    fn test_config_discover_null_safe() {
        let discovered = unsafe { kreuzberg_config_discover() };
        if discovered.is_null() {
            // NULL is a valid outcome when no config is found.
            return;
        }

        unsafe {
            crate::kreuzberg_free_string(discovered);
        }
    }
}