kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,1341 +0,0 @@
1
- //! Centralized FFI configuration parsing module.
2
- //!
3
- //! This module consolidates all configuration parsing logic that was previously
4
- //! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
5
- //!
6
- //! Instead of each binding reimplementing config parsing from JSON, they now
7
- //! call the FFI functions provided here, ensuring:
8
- //! - Single source of truth for validation rules
9
- //! - Consistent behavior across all languages
10
- //! - Elimination of drift/inconsistencies
11
- //! - Better performance (no JSON round-trips in language bindings)
12
-
13
- use crate::ffi_panic_guard;
14
- use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
15
- use kreuzberg::KreuzbergError;
16
- use kreuzberg::core::config::ExtractionConfig;
17
- use serde::Serialize;
18
- use std::ffi::{CStr, CString};
19
- use std::os::raw::c_char;
20
- use std::path::Path;
21
- use std::ptr;
22
-
23
- type FfiResult<T> = std::result::Result<T, String>;
24
-
25
- /// Parse an ExtractionConfig from a JSON string.
26
- ///
27
- /// This is the primary FFI entry point for all language bindings to parse
28
- /// configuration from JSON. Replaces the need for each binding to implement
29
- /// its own JSON parsing logic.
30
- ///
31
- /// # Arguments
32
- ///
33
- /// * `json_config` - Null-terminated C string containing JSON configuration
34
- ///
35
- /// # Returns
36
- ///
37
- /// A pointer to an ExtractionConfig struct that MUST be freed with
38
- /// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
39
- ///
40
- /// # Safety
41
- ///
42
- /// - `json_config` must be a valid null-terminated C string
43
- /// - The returned pointer must be freed with `kreuzberg_config_free`
44
- /// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
45
- ///
46
- /// # Example (C)
47
- ///
48
- /// ```c
49
- /// const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
50
- /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
51
- /// if (config == NULL) {
52
- /// printf("Error: %s\n", kreuzberg_last_error());
53
- /// return 1;
54
- /// }
55
- ///
56
- /// // Use config...
57
- /// // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
58
- ///
59
- /// kreuzberg_config_free(config);
60
- /// ```
61
- #[unsafe(no_mangle)]
62
- pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
63
- if json_config.is_null() {
64
- set_last_error("Config JSON cannot be NULL".to_string());
65
- return ptr::null_mut();
66
- }
67
-
68
- clear_last_error();
69
-
70
- let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
71
- Ok(s) => s,
72
- Err(e) => {
73
- set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
74
- return ptr::null_mut();
75
- }
76
- };
77
-
78
- match parse_extraction_config_from_json(json_str) {
79
- Ok(config) => Box::into_raw(Box::new(config)),
80
- Err(e) => {
81
- set_last_error(e);
82
- ptr::null_mut()
83
- }
84
- }
85
- }
86
-
87
- /// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
88
- ///
89
- /// # Safety
90
- ///
91
- /// - `config` must be a pointer previously returned by a config creation function
92
- /// - `config` can be NULL (no-op)
93
- /// - `config` must not be used after this call
94
- ///
95
- /// # Example (C)
96
- ///
97
- /// ```c
98
- /// ExtractionConfig* config = kreuzberg_config_from_json("{...}");
99
- /// if (config != NULL) {
100
- /// // Use config...
101
- /// kreuzberg_config_free(config);
102
- /// }
103
- /// ```
104
- #[unsafe(no_mangle)]
105
- pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
106
- if !config.is_null() {
107
- let _ = unsafe { Box::from_raw(config) };
108
- }
109
- }
110
-
111
- /// Validate a JSON config string without parsing it.
112
- ///
113
- /// This function checks if a JSON config string is valid and would parse correctly,
114
- /// without allocating the full ExtractionConfig structure. Useful for validation
115
- /// before committing to parsing.
116
- ///
117
- /// # Arguments
118
- ///
119
- /// * `json_config` - Null-terminated C string containing JSON configuration
120
- ///
121
- /// # Returns
122
- ///
123
- /// - 1 if valid (would parse successfully)
124
- /// - 0 if invalid (check `kreuzberg_last_error` for details)
125
- ///
126
- /// # Safety
127
- ///
128
- /// - `json_config` must be a valid null-terminated C string
129
- ///
130
- /// # Example (C)
131
- ///
132
- /// ```c
133
- /// const char* config_json = "{\"use_cache\": true}";
134
- /// if (kreuzberg_config_is_valid(config_json)) {
135
- /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
136
- /// // Use config...
137
- /// kreuzberg_config_free(config);
138
- /// } else {
139
- /// printf("Invalid config: %s\n", kreuzberg_last_error());
140
- /// }
141
- /// ```
142
- #[unsafe(no_mangle)]
143
- pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
144
- if json_config.is_null() {
145
- set_last_error("Config JSON cannot be NULL".to_string());
146
- return 0;
147
- }
148
-
149
- clear_last_error();
150
-
151
- let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
152
- Ok(s) => s,
153
- Err(e) => {
154
- set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
155
- return 0;
156
- }
157
- };
158
-
159
- match parse_extraction_config_from_json(json_str) {
160
- Ok(_) => 1,
161
- Err(e) => {
162
- set_last_error(e);
163
- 0
164
- }
165
- }
166
- }
167
-
168
- /// Serialize an ExtractionConfig to JSON string.
169
- ///
170
- /// Converts an ExtractionConfig structure to its JSON representation, allowing
171
- /// bindings to serialize configs without reimplementing serialization logic.
172
- ///
173
- /// # Arguments
174
- ///
175
- /// * `config` - Pointer to an ExtractionConfig structure
176
- ///
177
- /// # Returns
178
- ///
179
- /// A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
180
- /// Returns NULL on error (check `kreuzberg_last_error`).
181
- ///
182
- /// # Safety
183
- ///
184
- /// - `config` must be a valid pointer to an ExtractionConfig
185
- /// - `config` cannot be NULL
186
- /// - The returned pointer must be freed with `kreuzberg_free_string`
187
- ///
188
- /// # Example (C)
189
- ///
190
- /// ```c
191
- /// ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
192
- /// if (config != NULL) {
193
- /// char* json = kreuzberg_config_to_json(config);
194
- /// if (json != NULL) {
195
- /// printf("Serialized: %s\n", json);
196
- /// kreuzberg_free_string(json);
197
- /// }
198
- /// kreuzberg_config_free(config);
199
- /// }
200
- /// ```
201
- #[unsafe(no_mangle)]
202
- pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
203
- if config.is_null() {
204
- set_last_error("Config cannot be NULL".to_string());
205
- return ptr::null_mut();
206
- }
207
-
208
- clear_last_error();
209
-
210
- match serde_json::to_string(unsafe { &*config }) {
211
- Ok(json) => match std::ffi::CString::new(json) {
212
- Ok(c_string) => c_string.into_raw(),
213
- Err(e) => {
214
- set_last_error(format!("Failed to convert JSON to C string: {}", e));
215
- ptr::null_mut()
216
- }
217
- },
218
- Err(e) => {
219
- set_last_error(format!("Failed to serialize config to JSON: {}", e));
220
- ptr::null_mut()
221
- }
222
- }
223
- }
224
-
225
- /// Get a specific field from config as JSON string.
226
- ///
227
- /// Retrieves a nested field from the configuration by path and returns its JSON
228
- /// representation. Supports dot notation for nested fields (e.g., "ocr.backend").
229
- ///
230
- /// # Arguments
231
- ///
232
- /// * `config` - Pointer to an ExtractionConfig structure
233
- /// * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
234
- ///
235
- /// # Returns
236
- ///
237
- /// A pointer to a C string containing the field value as JSON, or NULL if:
238
- /// - The field doesn't exist
239
- /// - An error occurs during serialization
240
- ///
241
- /// The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
242
- ///
243
- /// # Safety
244
- ///
245
- /// - `config` must be a valid pointer to an ExtractionConfig
246
- /// - `field_name` must be a valid null-terminated C string
247
- /// - Neither parameter can be NULL
248
- ///
249
- /// # Example (C)
250
- ///
251
- /// ```c
252
- /// ExtractionConfig* config = kreuzberg_config_from_json(
253
- /// "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
254
- /// );
255
- /// if (config != NULL) {
256
- /// char* use_cache = kreuzberg_config_get_field(config, "use_cache");
257
- /// char* backend = kreuzberg_config_get_field(config, "ocr.backend");
258
- ///
259
- /// if (use_cache != NULL) {
260
- /// printf("use_cache: %s\n", use_cache);
261
- /// kreuzberg_free_string(use_cache);
262
- /// }
263
- ///
264
- /// if (backend != NULL) {
265
- /// printf("backend: %s\n", backend);
266
- /// kreuzberg_free_string(backend);
267
- /// }
268
- ///
269
- /// kreuzberg_config_free(config);
270
- /// }
271
- /// ```
272
- #[unsafe(no_mangle)]
273
- pub unsafe extern "C" fn kreuzberg_config_get_field(
274
- config: *const ExtractionConfig,
275
- field_name: *const c_char,
276
- ) -> *mut c_char {
277
- if config.is_null() {
278
- set_last_error("Config cannot be NULL".to_string());
279
- return ptr::null_mut();
280
- }
281
-
282
- if field_name.is_null() {
283
- set_last_error("Field name cannot be NULL".to_string());
284
- return ptr::null_mut();
285
- }
286
-
287
- clear_last_error();
288
-
289
- let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
290
- Ok(s) => s,
291
- Err(e) => {
292
- set_last_error(format!("Invalid UTF-8 in field name: {}", e));
293
- return ptr::null_mut();
294
- }
295
- };
296
-
297
- let json_value = match serde_json::to_value(unsafe { &*config }) {
298
- Ok(val) => val,
299
- Err(e) => {
300
- set_last_error(format!("Failed to serialize config: {}", e));
301
- return ptr::null_mut();
302
- }
303
- };
304
-
305
- let mut current = &json_value;
306
- for part in field_str.split('.') {
307
- if let Some(obj) = current.as_object() {
308
- match obj.get(part) {
309
- Some(val) => current = val,
310
- None => {
311
- set_last_error(format!("Field '{}' not found in config", field_str));
312
- return ptr::null_mut();
313
- }
314
- }
315
- } else {
316
- set_last_error(format!("Cannot access nested field '{}' in non-object", part));
317
- return ptr::null_mut();
318
- }
319
- }
320
-
321
- match serde_json::to_string(current) {
322
- Ok(json) => match std::ffi::CString::new(json) {
323
- Ok(c_string) => c_string.into_raw(),
324
- Err(e) => {
325
- set_last_error(format!("Failed to convert field value to C string: {}", e));
326
- ptr::null_mut()
327
- }
328
- },
329
- Err(e) => {
330
- set_last_error(format!("Failed to serialize field value: {}", e));
331
- ptr::null_mut()
332
- }
333
- }
334
- }
335
-
336
- /// Merge two configs (override takes precedence over base).
337
- ///
338
- /// Performs a shallow merge of two ExtractionConfig structures, where fields
339
- /// from `override_config` take precedence over fields in `base`. The `base`
340
- /// config is modified in-place.
341
- ///
342
- /// # Arguments
343
- ///
344
- /// * `base` - Pointer to the base ExtractionConfig (will be modified)
345
- /// * `override_config` - Pointer to the override ExtractionConfig (read-only)
346
- ///
347
- /// # Returns
348
- ///
349
- /// - 1 on success
350
- /// - 0 on error (check `kreuzberg_last_error`)
351
- ///
352
- /// # Safety
353
- ///
354
- /// - `base` must be a valid mutable pointer to an ExtractionConfig
355
- /// - `override_config` must be a valid pointer to an ExtractionConfig
356
- /// - Neither parameter can be NULL
357
- /// - `base` is modified in-place
358
- ///
359
- /// # Example (C)
360
- ///
361
- /// ```c
362
- /// ExtractionConfig* base = kreuzberg_config_from_json(
363
- /// "{\"use_cache\": true, \"force_ocr\": false}"
364
- /// );
365
- /// ExtractionConfig* override = kreuzberg_config_from_json(
366
- /// "{\"force_ocr\": true}"
367
- /// );
368
- ///
369
- /// if (kreuzberg_config_merge(base, override) == 1) {
370
- /// // base now has: use_cache=true, force_ocr=true
371
- /// char* json = kreuzberg_config_to_json(base);
372
- /// printf("Merged config: %s\n", json);
373
- /// kreuzberg_free_string(json);
374
- /// }
375
- ///
376
- /// kreuzberg_config_free(base);
377
- /// kreuzberg_config_free(override);
378
- /// ```
379
- #[unsafe(no_mangle)]
380
- pub unsafe extern "C" fn kreuzberg_config_merge(
381
- base: *mut ExtractionConfig,
382
- override_config: *const ExtractionConfig,
383
- ) -> i32 {
384
- if base.is_null() {
385
- set_last_error("Base config cannot be NULL".to_string());
386
- return 0;
387
- }
388
-
389
- if override_config.is_null() {
390
- set_last_error("Override config cannot be NULL".to_string());
391
- return 0;
392
- }
393
-
394
- clear_last_error();
395
-
396
- let base_ref = unsafe { &mut *base };
397
- let override_ref = unsafe { &*override_config };
398
-
399
- base_ref.use_cache = override_ref.use_cache;
400
- base_ref.enable_quality_processing = override_ref.enable_quality_processing;
401
- base_ref.force_ocr = override_ref.force_ocr;
402
- base_ref.max_concurrent_extractions = override_ref.max_concurrent_extractions;
403
-
404
- if override_ref.ocr.is_some() {
405
- base_ref.ocr = override_ref.ocr.clone();
406
- }
407
-
408
- if override_ref.chunking.is_some() {
409
- base_ref.chunking = override_ref.chunking.clone();
410
- }
411
-
412
- if override_ref.images.is_some() {
413
- base_ref.images = override_ref.images.clone();
414
- }
415
-
416
- #[cfg(feature = "pdf")]
417
- if override_ref.pdf_options.is_some() {
418
- base_ref.pdf_options = override_ref.pdf_options.clone();
419
- }
420
-
421
- if override_ref.token_reduction.is_some() {
422
- base_ref.token_reduction = override_ref.token_reduction.clone();
423
- }
424
-
425
- if override_ref.language_detection.is_some() {
426
- base_ref.language_detection = override_ref.language_detection.clone();
427
- }
428
-
429
- if override_ref.pages.is_some() {
430
- base_ref.pages = override_ref.pages.clone();
431
- }
432
-
433
- #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
434
- if override_ref.keywords.is_some() {
435
- base_ref.keywords = override_ref.keywords.clone();
436
- }
437
-
438
- if override_ref.postprocessor.is_some() {
439
- base_ref.postprocessor = override_ref.postprocessor.clone();
440
- }
441
-
442
- if override_ref.html_options.is_some() {
443
- base_ref.html_options = override_ref.html_options.clone();
444
- }
445
-
446
- 1
447
- }
448
-
449
- /// Parse ExtractionConfig from JSON string.
450
- ///
451
- /// This is the core parsing logic shared by all FFI functions that deal with
452
- /// JSON configuration. It handles:
453
- /// - JSON deserialization
454
- /// - All validation rules
455
- /// - Type conversions
456
- /// - HTML options parsing (complex nested structure)
457
- ///
458
- /// The error messages are user-friendly and include guidance on what went wrong.
459
- fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
460
- use html_to_markdown_rs::options::{
461
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
462
- PreprocessingPreset, WhitespaceMode,
463
- };
464
-
465
- // ~keep: This function performs the JSON parsing and validation that was
466
-
467
- fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
468
- where
469
- F: Fn(&str) -> FfiResult<T>,
470
- {
471
- if let Some(raw) = value {
472
- let text = raw
473
- .as_str()
474
- .ok_or_else(|| "Expected string for enum field".to_string())?;
475
- return parse_fn(text).map(Some);
476
- }
477
- Ok(None)
478
- }
479
-
480
- fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
481
- match value.to_lowercase().as_str() {
482
- "atx" => Ok(HeadingStyle::Atx),
483
- "underlined" => Ok(HeadingStyle::Underlined),
484
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
485
- other => Err(format!(
486
- "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
487
- other
488
- )),
489
- }
490
- }
491
-
492
- fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
493
- match value.to_lowercase().as_str() {
494
- "spaces" => Ok(ListIndentType::Spaces),
495
- "tabs" => Ok(ListIndentType::Tabs),
496
- other => Err(format!(
497
- "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
498
- other
499
- )),
500
- }
501
- }
502
-
503
- fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
504
- match value.to_lowercase().as_str() {
505
- "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
506
- "html" => Ok(HighlightStyle::Html),
507
- "bold" => Ok(HighlightStyle::Bold),
508
- "none" => Ok(HighlightStyle::None),
509
- other => Err(format!(
510
- "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
511
- other
512
- )),
513
- }
514
- }
515
-
516
- fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
517
- match value.to_lowercase().as_str() {
518
- "normalized" => Ok(WhitespaceMode::Normalized),
519
- "strict" => Ok(WhitespaceMode::Strict),
520
- other => Err(format!(
521
- "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
522
- other
523
- )),
524
- }
525
- }
526
-
527
- fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
528
- match value.to_lowercase().as_str() {
529
- "spaces" => Ok(NewlineStyle::Spaces),
530
- "backslash" => Ok(NewlineStyle::Backslash),
531
- other => Err(format!(
532
- "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
533
- other
534
- )),
535
- }
536
- }
537
-
538
- fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
539
- match value.to_lowercase().as_str() {
540
- "indented" => Ok(CodeBlockStyle::Indented),
541
- "backticks" => Ok(CodeBlockStyle::Backticks),
542
- "tildes" => Ok(CodeBlockStyle::Tildes),
543
- other => Err(format!(
544
- "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
545
- other
546
- )),
547
- }
548
- }
549
-
550
- #[allow(dead_code)]
551
- fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
552
- match value.to_lowercase().as_str() {
553
- "minimal" => Ok(PreprocessingPreset::Minimal),
554
- "standard" => Ok(PreprocessingPreset::Standard),
555
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
556
- other => Err(format!(
557
- "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
558
- other
559
- )),
560
- }
561
- }
562
-
563
- fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
564
- let mut opts = ConversionOptions::default();
565
- let obj = value
566
- .as_object()
567
- .ok_or_else(|| "html_options must be an object".to_string())?;
568
-
569
- if let Some(val) = obj.get("heading_style") {
570
- opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
571
- }
572
-
573
- if let Some(val) = obj.get("list_indent_type") {
574
- opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
575
- }
576
-
577
- if let Some(val) = obj.get("list_indent_width") {
578
- opts.list_indent_width = val
579
- .as_u64()
580
- .map(|v| v as usize)
581
- .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
582
- }
583
-
584
- if let Some(val) = obj.get("bullets") {
585
- opts.bullets = val
586
- .as_str()
587
- .map(str::to_string)
588
- .ok_or_else(|| "bullets must be a string".to_string())?;
589
- }
590
-
591
- if let Some(val) = obj.get("strong_em_symbol") {
592
- let symbol = val
593
- .as_str()
594
- .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
595
- let mut chars = symbol.chars();
596
- opts.strong_em_symbol = chars
597
- .next()
598
- .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
599
- }
600
-
601
- if let Some(val) = obj.get("escape_asterisks") {
602
- opts.escape_asterisks = val
603
- .as_bool()
604
- .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
605
- }
606
-
607
- if let Some(val) = obj.get("escape_underscores") {
608
- opts.escape_underscores = val
609
- .as_bool()
610
- .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
611
- }
612
-
613
- if let Some(val) = obj.get("escape_misc") {
614
- opts.escape_misc = val
615
- .as_bool()
616
- .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
617
- }
618
-
619
- if let Some(val) = obj.get("escape_ascii") {
620
- opts.escape_ascii = val
621
- .as_bool()
622
- .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
623
- }
624
-
625
- if let Some(val) = obj.get("code_language") {
626
- opts.code_language = val
627
- .as_str()
628
- .map(str::to_string)
629
- .ok_or_else(|| "code_language must be a string".to_string())?;
630
- }
631
-
632
- if let Some(val) = obj.get("autolinks") {
633
- opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
634
- }
635
-
636
- if let Some(val) = obj.get("default_title") {
637
- opts.default_title = val
638
- .as_bool()
639
- .ok_or_else(|| "default_title must be a boolean".to_string())?;
640
- }
641
-
642
- if let Some(val) = obj.get("br_in_tables") {
643
- opts.br_in_tables = val
644
- .as_bool()
645
- .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
646
- }
647
-
648
- if let Some(val) = obj.get("hocr_spatial_tables") {
649
- opts.hocr_spatial_tables = val
650
- .as_bool()
651
- .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
652
- }
653
-
654
- if let Some(val) = obj.get("highlight_style") {
655
- opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
656
- }
657
-
658
- if let Some(val) = obj.get("extract_metadata") {
659
- opts.extract_metadata = val
660
- .as_bool()
661
- .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
662
- }
663
-
664
- if let Some(val) = obj.get("whitespace_mode") {
665
- opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
666
- }
667
-
668
- if let Some(val) = obj.get("strip_newlines") {
669
- opts.strip_newlines = val
670
- .as_bool()
671
- .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
672
- }
673
-
674
- if let Some(val) = obj.get("wrap") {
675
- opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
676
- }
677
-
678
- if let Some(val) = obj.get("wrap_width") {
679
- opts.wrap_width = val
680
- .as_u64()
681
- .map(|v| v as usize)
682
- .ok_or_else(|| "wrap_width must be an integer".to_string())?;
683
- }
684
-
685
- if let Some(val) = obj.get("convert_as_inline") {
686
- opts.convert_as_inline = val
687
- .as_bool()
688
- .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
689
- }
690
-
691
- if let Some(val) = obj.get("sub_symbol") {
692
- opts.sub_symbol = val
693
- .as_str()
694
- .map(str::to_string)
695
- .ok_or_else(|| "sub_symbol must be a string".to_string())?;
696
- }
697
-
698
- if let Some(val) = obj.get("sup_symbol") {
699
- opts.sup_symbol = val
700
- .as_str()
701
- .map(str::to_string)
702
- .ok_or_else(|| "sup_symbol must be a string".to_string())?;
703
- }
704
-
705
- if let Some(val) = obj.get("newline_style") {
706
- opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
707
- }
708
-
709
- if let Some(val) = obj.get("code_block_style") {
710
- opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
711
- }
712
-
713
- if let Some(val) = obj.get("keep_inline_images_in") {
714
- opts.keep_inline_images_in = val
715
- .as_array()
716
- .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
717
- .iter()
718
- .map(|v| {
719
- v.as_str()
720
- .map(str::to_string)
721
- .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
722
- })
723
- .collect::<FfiResult<Vec<_>>>()?;
724
- }
725
-
726
- if let Some(val) = obj.get("encoding") {
727
- opts.encoding = val
728
- .as_str()
729
- .map(str::to_string)
730
- .ok_or_else(|| "encoding must be a string".to_string())?;
731
- }
732
-
733
- if let Some(val) = obj.get("debug") {
734
- opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
735
- }
736
-
737
- if let Some(val) = obj.get("strip_tags") {
738
- opts.strip_tags = val
739
- .as_array()
740
- .ok_or_else(|| "strip_tags must be an array".to_string())?
741
- .iter()
742
- .map(|v| {
743
- v.as_str()
744
- .map(str::to_string)
745
- .ok_or_else(|| "strip_tags entries must be strings".to_string())
746
- })
747
- .collect::<FfiResult<Vec<_>>>()?;
748
- }
749
-
750
- if let Some(val) = obj.get("preserve_tags") {
751
- opts.preserve_tags = val
752
- .as_array()
753
- .ok_or_else(|| "preserve_tags must be an array".to_string())?
754
- .iter()
755
- .map(|v| {
756
- v.as_str()
757
- .map(str::to_string)
758
- .ok_or_else(|| "preserve_tags entries must be strings".to_string())
759
- })
760
- .collect::<FfiResult<Vec<_>>>()?;
761
- }
762
-
763
- Ok(opts)
764
- }
765
-
766
- let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
767
-
768
- let mut config: ExtractionConfig =
769
- serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
770
-
771
- if let Some(html_opts_val) = json_value.get("html_options") {
772
- config.html_options = Some(parse_html_options(html_opts_val)?);
773
- }
774
-
775
- Ok(config)
776
- }
777
-
778
- /// SerializableEmbeddingPreset for FFI serialization.
779
- #[derive(Serialize)]
780
- struct SerializableEmbeddingPreset<'a> {
781
- name: &'a str,
782
- chunk_size: usize,
783
- overlap: usize,
784
- model_name: String,
785
- dimensions: usize,
786
- description: &'a str,
787
- }
788
-
789
- /// Load an ExtractionConfig from a file.
790
- ///
791
- /// Returns a JSON string representing the loaded configuration.
792
- ///
793
- /// # Safety
794
- ///
795
- /// - `file_path` must be a valid null-terminated C string
796
- /// - The returned string must be freed with `kreuzberg_free_string`
797
- /// - Returns NULL on error (check `kreuzberg_last_error`)
798
- #[unsafe(no_mangle)]
799
- pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
800
- ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
801
- clear_last_error();
802
-
803
- if file_path.is_null() {
804
- set_last_error("file_path cannot be NULL".to_string());
805
- return ptr::null_mut();
806
- }
807
-
808
- let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
809
- Ok(s) => s,
810
- Err(e) => {
811
- set_last_error(format!("Invalid UTF-8 in file path: {}", e));
812
- return ptr::null_mut();
813
- }
814
- };
815
-
816
- match ExtractionConfig::from_file(path_str) {
817
- Ok(config) => match serde_json::to_string(&config) {
818
- Ok(json) => match CString::new(json) {
819
- Ok(cstr) => cstr.into_raw(),
820
- Err(e) => {
821
- set_last_error(format!("Failed to create C string: {}", e));
822
- ptr::null_mut()
823
- }
824
- },
825
- Err(e) => {
826
- set_last_error(format!("Failed to serialize config to JSON: {}", e));
827
- ptr::null_mut()
828
- }
829
- },
830
- Err(e) => {
831
- set_last_error(e.to_string());
832
- ptr::null_mut()
833
- }
834
- }
835
- })
836
- }
837
-
838
- /// Load an ExtractionConfig from a file (returns pointer to config struct).
839
- ///
840
- /// # Safety
841
- ///
842
- /// - `path` must be a valid null-terminated C string
843
- /// - The returned pointer must be freed with `kreuzberg_config_free`
844
- /// - Returns NULL on error (check `kreuzberg_last_error`)
845
- ///
846
- /// # Example (C)
847
- ///
848
- /// ```c
849
- /// ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
850
- /// if (config == NULL) {
851
- /// printf("Error: %s\n", kreuzberg_last_error());
852
- /// return 1;
853
- /// }
854
- /// kreuzberg_config_free(config);
855
- /// ```
856
- #[unsafe(no_mangle)]
857
- pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
858
- ffi_panic_guard!("kreuzberg_config_from_file", {
859
- clear_last_error();
860
-
861
- if path.is_null() {
862
- set_last_error("Config path cannot be NULL".to_string());
863
- return ptr::null_mut();
864
- }
865
-
866
- let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
867
- Ok(s) => s,
868
- Err(e) => {
869
- set_last_error(format!("Invalid UTF-8 in config path: {}", e));
870
- return ptr::null_mut();
871
- }
872
- };
873
-
874
- let path_buf = Path::new(path_str);
875
-
876
- match ExtractionConfig::from_file(path_buf) {
877
- Ok(config) => Box::into_raw(Box::new(config)),
878
- Err(e) => {
879
- match &e {
880
- KreuzbergError::Io(io_err) => {
881
- set_last_error(format!("IO error loading config: {}", io_err));
882
- }
883
- _ => {
884
- set_last_error(format!("Failed to load config from file: {}", e));
885
- }
886
- }
887
- ptr::null_mut()
888
- }
889
- }
890
- })
891
- }
892
-
893
- /// Discover and load an ExtractionConfig by searching parent directories.
894
- ///
895
- /// Searches the current directory and all parent directories for:
896
- /// - `kreuzberg.toml`
897
- /// - `kreuzberg.json`
898
- ///
899
- /// Returns the first config file found as a JSON string.
900
- ///
901
- /// # Safety
902
- ///
903
- /// - The returned string must be freed with `kreuzberg_free_string`
904
- /// - Returns NULL if no config is found or on error
905
- ///
906
- /// # Example (C)
907
- ///
908
- /// ```c
909
- /// char* config_json = kreuzberg_config_discover();
910
- /// if (config_json != NULL) {
911
- /// printf("Discovered config: %s\n", config_json);
912
- /// kreuzberg_free_string(config_json);
913
- /// }
914
- /// ```
915
- #[unsafe(no_mangle)]
916
- pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
917
- ffi_panic_guard!("kreuzberg_config_discover", {
918
- clear_last_error();
919
-
920
- match ExtractionConfig::discover() {
921
- Ok(Some(config)) => match serde_json::to_string(&config) {
922
- Ok(json) => match CString::new(json) {
923
- Ok(cstr) => cstr.into_raw(),
924
- Err(e) => {
925
- set_last_error(format!("Failed to serialize config: {}", e));
926
- ptr::null_mut()
927
- }
928
- },
929
- Err(e) => {
930
- set_last_error(format!("Failed to serialize config: {}", e));
931
- ptr::null_mut()
932
- }
933
- },
934
- Ok(None) => ptr::null_mut(),
935
- Err(e) => {
936
- match &e {
937
- KreuzbergError::Io(io_err) => {
938
- set_last_error(format!("IO error discovering config: {}", io_err));
939
- }
940
- _ => {
941
- set_last_error(format!("Failed to discover config: {}", e));
942
- }
943
- }
944
- ptr::null_mut()
945
- }
946
- }
947
- })
948
- }
949
-
950
- /// List available embedding preset names.
951
- ///
952
- /// # Safety
953
- ///
954
- /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
955
- /// - Returns NULL on error (check `kreuzberg_last_error`)
956
- #[unsafe(no_mangle)]
957
- pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
958
- ffi_panic_guard!("kreuzberg_list_embedding_presets", {
959
- clear_last_error();
960
-
961
- let presets = kreuzberg::embeddings::list_presets();
962
- match serde_json::to_string(&presets) {
963
- Ok(json) => match string_to_c_string(json) {
964
- Ok(ptr) => ptr,
965
- Err(e) => {
966
- set_last_error(e);
967
- ptr::null_mut()
968
- }
969
- },
970
- Err(e) => {
971
- set_last_error(format!("Failed to serialize presets: {}", e));
972
- ptr::null_mut()
973
- }
974
- }
975
- })
976
- }
977
-
978
- /// Get a specific embedding preset by name.
979
- ///
980
- /// # Safety
981
- ///
982
- /// - `name` must be a valid null-terminated C string
983
- /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
984
- /// - Returns NULL on error (check `kreuzberg_last_error`)
985
- #[unsafe(no_mangle)]
986
- pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
987
- ffi_panic_guard!("kreuzberg_get_embedding_preset", {
988
- clear_last_error();
989
-
990
- if name.is_null() {
991
- set_last_error("preset name cannot be NULL".to_string());
992
- return ptr::null_mut();
993
- }
994
-
995
- let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
996
- Ok(s) => s,
997
- Err(e) => {
998
- set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
999
- return ptr::null_mut();
1000
- }
1001
- };
1002
-
1003
- let preset = match kreuzberg::embeddings::get_preset(preset_name) {
1004
- Some(preset) => preset,
1005
- None => {
1006
- set_last_error(format!("Unknown embedding preset: {}", preset_name));
1007
- return ptr::null_mut();
1008
- }
1009
- };
1010
-
1011
- let model_name = format!("{:?}", preset.model);
1012
- let serializable = SerializableEmbeddingPreset {
1013
- name: preset.name,
1014
- chunk_size: preset.chunk_size,
1015
- overlap: preset.overlap,
1016
- model_name,
1017
- dimensions: preset.dimensions,
1018
- description: preset.description,
1019
- };
1020
-
1021
- match serde_json::to_string(&serializable) {
1022
- Ok(json) => match string_to_c_string(json) {
1023
- Ok(ptr) => ptr,
1024
- Err(e) => {
1025
- set_last_error(e);
1026
- ptr::null_mut()
1027
- }
1028
- },
1029
- Err(e) => {
1030
- set_last_error(format!("Failed to serialize embedding preset: {}", e));
1031
- ptr::null_mut()
1032
- }
1033
- }
1034
- })
1035
- }
1036
-
1037
- #[cfg(test)]
1038
- mod tests {
1039
- use super::*;
1040
- use std::ffi::CStr;
1041
-
1042
- #[test]
1043
- fn test_parse_minimal_config() {
1044
- let json = "{}";
1045
- let result = parse_extraction_config_from_json(json);
1046
- assert!(result.is_ok());
1047
- }
1048
-
1049
- #[test]
1050
- fn test_parse_config_with_use_cache() {
1051
- let json = r#"{"use_cache": true}"#;
1052
- let result = parse_extraction_config_from_json(json);
1053
- assert!(result.is_ok());
1054
- let config = result.unwrap();
1055
- assert!(config.use_cache);
1056
- }
1057
-
1058
- #[test]
1059
- fn test_parse_config_with_ocr() {
1060
- let json = r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#;
1061
- let result = parse_extraction_config_from_json(json);
1062
- assert!(result.is_ok());
1063
- let config = result.unwrap();
1064
- assert!(config.ocr.is_some());
1065
- let ocr = config.ocr.unwrap();
1066
- assert_eq!(ocr.backend, "tesseract");
1067
- assert_eq!(ocr.language, "eng");
1068
- }
1069
-
1070
- #[test]
1071
- fn test_parse_invalid_json() {
1072
- let json = "{invalid json}";
1073
- let result = parse_extraction_config_from_json(json);
1074
- assert!(result.is_err());
1075
- }
1076
-
1077
- #[test]
1078
- fn test_parse_complex_config() {
1079
- let json = r#"{
1080
- "use_cache": true,
1081
- "enable_quality_processing": true,
1082
- "force_ocr": false,
1083
- "ocr": {
1084
- "backend": "tesseract",
1085
- "language": "eng"
1086
- },
1087
- "chunking": {
1088
- "max_chars": 1024,
1089
- "max_overlap": 128
1090
- },
1091
- "max_concurrent_extractions": 4
1092
- }"#;
1093
- let result = parse_extraction_config_from_json(json);
1094
- assert!(result.is_ok());
1095
- }
1096
-
1097
- #[test]
1098
- fn test_config_to_json() {
1099
- let json_str = r#"{"use_cache": true}"#;
1100
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1101
- assert!(!config_ptr.is_null());
1102
-
1103
- let json_out = unsafe { kreuzberg_config_to_json(config_ptr) };
1104
- assert!(!json_out.is_null());
1105
-
1106
- let out_str = unsafe { CStr::from_ptr(json_out).to_str().unwrap() };
1107
- assert!(out_str.contains("use_cache"));
1108
- assert!(out_str.contains("true"));
1109
-
1110
- unsafe {
1111
- crate::kreuzberg_free_string(json_out);
1112
- kreuzberg_config_free(config_ptr);
1113
- }
1114
- }
1115
-
1116
- #[test]
1117
- fn test_config_to_json_null_pointer() {
1118
- let result = unsafe { kreuzberg_config_to_json(ptr::null()) };
1119
- assert!(result.is_null());
1120
- }
1121
-
1122
- #[test]
1123
- fn test_config_get_field_simple() {
1124
- let json_str = r#"{"use_cache": true}"#;
1125
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1126
- assert!(!config_ptr.is_null());
1127
-
1128
- let field_name = std::ffi::CString::new("use_cache").unwrap();
1129
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1130
- assert!(!field_value.is_null());
1131
-
1132
- let value_str = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
1133
- assert_eq!(value_str, "true");
1134
-
1135
- unsafe {
1136
- crate::kreuzberg_free_string(field_value);
1137
- kreuzberg_config_free(config_ptr);
1138
- }
1139
- }
1140
-
1141
- #[test]
1142
- fn test_config_get_field_nested() {
1143
- let json_str = r#"{"ocr": {"backend": "tesseract"}}"#;
1144
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1145
- assert!(!config_ptr.is_null());
1146
-
1147
- let field_name = std::ffi::CString::new("ocr.backend").unwrap();
1148
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1149
- assert!(!field_value.is_null());
1150
-
1151
- let value_str = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
1152
- assert_eq!(value_str, r#""tesseract""#);
1153
-
1154
- unsafe {
1155
- crate::kreuzberg_free_string(field_value);
1156
- kreuzberg_config_free(config_ptr);
1157
- }
1158
- }
1159
-
1160
- #[test]
1161
- fn test_config_get_field_missing() {
1162
- let json_str = r#"{"use_cache": true}"#;
1163
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1164
- assert!(!config_ptr.is_null());
1165
-
1166
- let field_name = std::ffi::CString::new("nonexistent").unwrap();
1167
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1168
- assert!(field_value.is_null());
1169
-
1170
- unsafe {
1171
- kreuzberg_config_free(config_ptr);
1172
- }
1173
- }
1174
-
1175
- #[test]
1176
- fn test_config_get_field_null_pointer() {
1177
- let field_name = std::ffi::CString::new("use_cache").unwrap();
1178
- let result = unsafe { kreuzberg_config_get_field(ptr::null(), field_name.as_ptr()) };
1179
- assert!(result.is_null());
1180
- }
1181
-
1182
- #[test]
1183
- fn test_config_merge() {
1184
- let base_json = r#"{"use_cache": true, "force_ocr": false}"#;
1185
- let override_json = r#"{"force_ocr": true}"#;
1186
-
1187
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1188
- let override_ptr =
1189
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1190
-
1191
- assert!(!base_ptr.is_null());
1192
- assert!(!override_ptr.is_null());
1193
-
1194
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1195
- assert_eq!(result, 1);
1196
-
1197
- let merged_json = unsafe { kreuzberg_config_to_json(base_ptr) };
1198
- assert!(!merged_json.is_null());
1199
-
1200
- let merged_str = unsafe { CStr::from_ptr(merged_json).to_str().unwrap() };
1201
- assert!(merged_str.contains("use_cache"));
1202
- assert!(merged_str.contains("force_ocr"));
1203
-
1204
- unsafe {
1205
- crate::kreuzberg_free_string(merged_json);
1206
- kreuzberg_config_free(base_ptr);
1207
- kreuzberg_config_free(override_ptr);
1208
- }
1209
- }
1210
-
1211
- #[test]
1212
- fn test_config_merge_null_base() {
1213
- let override_json = r#"{"force_ocr": true}"#;
1214
- let override_ptr =
1215
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1216
-
1217
- let result = unsafe { kreuzberg_config_merge(ptr::null_mut(), override_ptr) };
1218
- assert_eq!(result, 0);
1219
-
1220
- unsafe {
1221
- kreuzberg_config_free(override_ptr);
1222
- }
1223
- }
1224
-
1225
- #[test]
1226
- fn test_config_merge_null_override() {
1227
- let base_json = r#"{"use_cache": true}"#;
1228
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1229
-
1230
- let result = unsafe { kreuzberg_config_merge(base_ptr, ptr::null()) };
1231
- assert_eq!(result, 0);
1232
-
1233
- unsafe {
1234
- kreuzberg_config_free(base_ptr);
1235
- }
1236
- }
1237
-
1238
- #[test]
1239
- fn test_config_merge_override_to_default_value() {
1240
- let base_json = r#"{"use_cache": false}"#;
1241
- let override_json = r#"{"use_cache": true}"#;
1242
-
1243
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1244
- let override_ptr =
1245
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1246
-
1247
- assert!(!base_ptr.is_null());
1248
- assert!(!override_ptr.is_null());
1249
-
1250
- let base_ref = unsafe { &*base_ptr };
1251
- assert!(!base_ref.use_cache);
1252
-
1253
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1254
- assert_eq!(result, 1);
1255
-
1256
- let base_ref = unsafe { &*base_ptr };
1257
- assert!(base_ref.use_cache, "override to default value should be applied");
1258
-
1259
- unsafe {
1260
- kreuzberg_config_free(base_ptr);
1261
- kreuzberg_config_free(override_ptr);
1262
- }
1263
- }
1264
-
1265
- #[test]
1266
- fn test_config_merge_override_force_ocr() {
1267
- let base_json = r#"{"force_ocr": false}"#;
1268
- let override_json = r#"{"force_ocr": true}"#;
1269
-
1270
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1271
- let override_ptr =
1272
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1273
-
1274
- assert!(!base_ptr.is_null());
1275
- assert!(!override_ptr.is_null());
1276
-
1277
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1278
- assert_eq!(result, 1);
1279
-
1280
- let base_ref = unsafe { &*base_ptr };
1281
- assert!(base_ref.force_ocr);
1282
-
1283
- unsafe {
1284
- kreuzberg_config_free(base_ptr);
1285
- kreuzberg_config_free(override_ptr);
1286
- }
1287
- }
1288
-
1289
- #[test]
1290
- fn test_list_embedding_presets() {
1291
- let result = unsafe { kreuzberg_list_embedding_presets() };
1292
- assert!(!result.is_null());
1293
-
1294
- let presets_str = unsafe { CStr::from_ptr(result).to_str().unwrap() };
1295
- assert!(presets_str.starts_with('['));
1296
- assert!(presets_str.ends_with(']'));
1297
-
1298
- unsafe {
1299
- crate::kreuzberg_free_string(result);
1300
- }
1301
- }
1302
-
1303
- #[test]
1304
- fn test_get_embedding_preset_null() {
1305
- let result = unsafe { kreuzberg_get_embedding_preset(ptr::null()) };
1306
- assert!(result.is_null());
1307
- }
1308
-
1309
- #[test]
1310
- fn test_get_embedding_preset_unknown() {
1311
- let name = CString::new("nonexistent_preset").unwrap();
1312
- let result = unsafe { kreuzberg_get_embedding_preset(name.as_ptr()) };
1313
- assert!(result.is_null());
1314
- }
1315
-
1316
- #[test]
1317
- fn test_get_embedding_preset_valid() {
1318
- let name = CString::new("fast").unwrap();
1319
- let result = unsafe { kreuzberg_get_embedding_preset(name.as_ptr()) };
1320
- assert!(!result.is_null());
1321
-
1322
- let preset_str = unsafe { CStr::from_ptr(result).to_str().unwrap() };
1323
- assert!(preset_str.contains("name"));
1324
- assert!(preset_str.contains("chunk_size"));
1325
-
1326
- unsafe {
1327
- crate::kreuzberg_free_string(result);
1328
- }
1329
- }
1330
-
1331
- #[test]
1332
- fn test_config_discover_null_safe() {
1333
- let result = unsafe { kreuzberg_config_discover() };
1334
- // Result can be null if no config found, which is valid
1335
- if !result.is_null() {
1336
- unsafe {
1337
- crate::kreuzberg_free_string(result);
1338
- }
1339
- }
1340
- }
1341
- }