kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,1341 +0,0 @@
1
- //! Centralized FFI configuration parsing module.
2
- //!
3
- //! This module consolidates all configuration parsing logic that was previously
4
- //! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
5
- //!
6
- //! Instead of each binding reimplementing config parsing from JSON, they now
7
- //! call the FFI functions provided here, ensuring:
8
- //! - Single source of truth for validation rules
9
- //! - Consistent behavior across all languages
10
- //! - Elimination of drift/inconsistencies
11
- //! - Better performance (no JSON round-trips in language bindings)
12
-
13
- use crate::ffi_panic_guard;
14
- use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
15
- use kreuzberg::KreuzbergError;
16
- use kreuzberg::core::config::ExtractionConfig;
17
- use serde::Serialize;
18
- use std::ffi::{CStr, CString};
19
- use std::os::raw::c_char;
20
- use std::path::Path;
21
- use std::ptr;
22
-
23
- type FfiResult<T> = std::result::Result<T, String>;
24
-
25
- /// Parse an ExtractionConfig from a JSON string.
26
- ///
27
- /// This is the primary FFI entry point for all language bindings to parse
28
- /// configuration from JSON. Replaces the need for each binding to implement
29
- /// its own JSON parsing logic.
30
- ///
31
- /// # Arguments
32
- ///
33
- /// * `json_config` - Null-terminated C string containing JSON configuration
34
- ///
35
- /// # Returns
36
- ///
37
- /// A pointer to an ExtractionConfig struct that MUST be freed with
38
- /// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
39
- ///
40
- /// # Safety
41
- ///
42
- /// - `json_config` must be a valid null-terminated C string
43
- /// - The returned pointer must be freed with `kreuzberg_config_free`
44
- /// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
45
- ///
46
- /// # Example (C)
47
- ///
48
- /// ```c
49
- /// const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
50
- /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
51
- /// if (config == NULL) {
52
- /// printf("Error: %s\n", kreuzberg_last_error());
53
- /// return 1;
54
- /// }
55
- ///
56
- /// // Use config...
57
- /// // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
58
- ///
59
- /// kreuzberg_config_free(config);
60
- /// ```
61
- #[unsafe(no_mangle)]
62
- pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
63
- if json_config.is_null() { // reject NULL up front; the pointer is never dereferenced on this path
64
- set_last_error("Config JSON cannot be NULL".to_string());
65
- return ptr::null_mut();
66
- }
67
- // clear_last_error() runs only after the NULL check, so the NULL-argument message set above is not wiped.
68
- clear_last_error();
69
- // Borrow the caller's bytes as &str; no copy is made, and non-UTF-8 input is reported as an error.
70
- let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() { // SAFETY: non-null checked above; caller guarantees NUL termination (see # Safety)
71
- Ok(s) => s,
72
- Err(e) => {
73
- set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
74
- return ptr::null_mut();
75
- }
76
- };
77
- // Parsing is delegated to parse_extraction_config_from_json, which is defined outside this excerpt.
78
- match parse_extraction_config_from_json(json_str) {
79
- Ok(config) => Box::into_raw(Box::new(config)), // heap-allocate and hand ownership to the caller; released via kreuzberg_config_free
80
- Err(e) => {
81
- set_last_error(e); // the parser's error string is passed through unchanged
82
- ptr::null_mut()
83
- }
84
- }
85
- }
86
-
87
- /// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
88
- ///
89
- /// # Safety
90
- ///
91
- /// - `config` must be a pointer previously returned by a config creation function
92
- /// - `config` can be NULL (no-op)
93
- /// - `config` must not be used after this call
94
- ///
95
- /// # Example (C)
96
- ///
97
- /// ```c
98
- /// ExtractionConfig* config = kreuzberg_config_from_json("{...}");
99
- /// if (config != NULL) {
100
- /// // Use config...
101
- /// kreuzberg_config_free(config);
102
- /// }
103
- /// ```
104
- #[unsafe(no_mangle)]
105
- pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
106
- if !config.is_null() { // NULL is accepted and silently ignored
107
- let _ = unsafe { Box::from_raw(config) }; // SAFETY: caller guarantees the pointer came from a config creation function; binding to `_` drops the rebuilt Box immediately, freeing the config
108
- }
109
- }
110
-
111
- /// Validate a JSON config string without returning the parsed config.
112
- ///
113
- /// This function checks whether a JSON config string parses successfully. The
114
- /// string goes through the same parser as `kreuzberg_config_from_json`, but the
115
- /// resulting ExtractionConfig is discarded instead of being returned to the caller.
116
- ///
117
- /// # Arguments
118
- ///
119
- /// * `json_config` - Null-terminated C string containing JSON configuration
120
- ///
121
- /// # Returns
122
- ///
123
- /// - 1 if valid (would parse successfully)
124
- /// - 0 if invalid (check `kreuzberg_last_error` for details)
125
- ///
126
- /// # Safety
127
- ///
128
- /// - `json_config` must be a valid null-terminated C string
129
- ///
130
- /// # Example (C)
131
- ///
132
- /// ```c
133
- /// const char* config_json = "{\"use_cache\": true}";
134
- /// if (kreuzberg_config_is_valid(config_json)) {
135
- /// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
136
- /// // Use config...
137
- /// kreuzberg_config_free(config);
138
- /// } else {
139
- /// printf("Invalid config: %s\n", kreuzberg_last_error());
140
- /// }
141
- /// ```
142
- #[unsafe(no_mangle)]
143
- pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
144
- if json_config.is_null() { // NULL counts as invalid (returns 0) rather than being dereferenced
145
- set_last_error("Config JSON cannot be NULL".to_string());
146
- return 0;
147
- }
148
- // clear_last_error() runs only after the NULL check, so the NULL-argument message set above is not wiped.
149
- clear_last_error();
150
- // Borrow the caller's bytes as &str; non-UTF-8 input is reported as invalid.
151
- let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() { // SAFETY: non-null checked above; caller guarantees NUL termination (see # Safety)
152
- Ok(s) => s,
153
- Err(e) => {
154
- set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
155
- return 0;
156
- }
157
- };
158
- // The string is parsed with the same helper kreuzberg_config_from_json uses; only the outcome is kept.
159
- match parse_extraction_config_from_json(json_str) {
160
- Ok(_) => 1, // the parsed config is dropped right here
161
- Err(e) => {
162
- set_last_error(e); // the parser's error string is passed through unchanged
163
- 0
164
- }
165
- }
166
- }
167
-
168
- /// Serialize an ExtractionConfig to JSON string.
169
- ///
170
- /// Converts an ExtractionConfig structure to its JSON representation, allowing
171
- /// bindings to serialize configs without reimplementing serialization logic.
172
- ///
173
- /// # Arguments
174
- ///
175
- /// * `config` - Pointer to an ExtractionConfig structure
176
- ///
177
- /// # Returns
178
- ///
179
- /// A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
180
- /// Returns NULL on error (check `kreuzberg_last_error`).
181
- ///
182
- /// # Safety
183
- ///
184
- /// - `config` must be a valid pointer to an ExtractionConfig
185
- /// - `config` cannot be NULL
186
- /// - The returned pointer must be freed with `kreuzberg_free_string`
187
- ///
188
- /// # Example (C)
189
- ///
190
- /// ```c
191
- /// ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
192
- /// if (config != NULL) {
193
- /// char* json = kreuzberg_config_to_json(config);
194
- /// if (json != NULL) {
195
- /// printf("Serialized: %s\n", json);
196
- /// kreuzberg_free_string(json);
197
- /// }
198
- /// kreuzberg_config_free(config);
199
- /// }
200
- /// ```
201
- #[unsafe(no_mangle)]
202
- pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
203
- if config.is_null() { // reject NULL up front; the pointer is never dereferenced on this path
204
- set_last_error("Config cannot be NULL".to_string());
205
- return ptr::null_mut();
206
- }
207
- // clear_last_error() runs only after the NULL check, so the NULL-argument message set above is not wiped.
208
- clear_last_error();
209
- // Two failure points follow: serde serialization, then conversion of the JSON text to a C string.
210
- match serde_json::to_string(unsafe { &*config }) { // SAFETY: non-null checked above; caller guarantees it points to a live ExtractionConfig
211
- Ok(json) => match std::ffi::CString::new(json) { // CString::new fails if the text contains an interior NUL byte
212
- Ok(c_string) => c_string.into_raw(), // ownership passes to the caller; released via kreuzberg_free_string
213
- Err(e) => {
214
- set_last_error(format!("Failed to convert JSON to C string: {}", e));
215
- ptr::null_mut()
216
- }
217
- },
218
- Err(e) => {
219
- set_last_error(format!("Failed to serialize config to JSON: {}", e));
220
- ptr::null_mut()
221
- }
222
- }
223
- }
224
-
225
- /// Get a specific field from config as JSON string.
226
- ///
227
- /// Retrieves a nested field from the configuration by path and returns its JSON
228
- /// representation. Supports dot notation for nested fields (e.g., "ocr.backend").
229
- ///
230
- /// # Arguments
231
- ///
232
- /// * `config` - Pointer to an ExtractionConfig structure
233
- /// * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
234
- ///
235
- /// # Returns
236
- ///
237
- /// A pointer to a C string containing the field value as JSON, or NULL if:
238
- /// - The field doesn't exist
239
- /// - An error occurs during serialization
240
- ///
241
- /// The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
242
- ///
243
- /// # Safety
244
- ///
245
- /// - `config` must be a valid pointer to an ExtractionConfig
246
- /// - `field_name` must be a valid null-terminated C string
247
- /// - Neither parameter can be NULL
248
- ///
249
- /// # Example (C)
250
- ///
251
- /// ```c
252
- /// ExtractionConfig* config = kreuzberg_config_from_json(
253
- /// "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
254
- /// );
255
- /// if (config != NULL) {
256
- /// char* use_cache = kreuzberg_config_get_field(config, "use_cache");
257
- /// char* backend = kreuzberg_config_get_field(config, "ocr.backend");
258
- ///
259
- /// if (use_cache != NULL) {
260
- /// printf("use_cache: %s\n", use_cache);
261
- /// kreuzberg_free_string(use_cache);
262
- /// }
263
- ///
264
- /// if (backend != NULL) {
265
- /// printf("backend: %s\n", backend);
266
- /// kreuzberg_free_string(backend);
267
- /// }
268
- ///
269
- /// kreuzberg_config_free(config);
270
- /// }
271
- /// ```
272
- #[unsafe(no_mangle)]
273
- pub unsafe extern "C" fn kreuzberg_config_get_field(
274
- config: *const ExtractionConfig,
275
- field_name: *const c_char,
276
- ) -> *mut c_char {
277
- if config.is_null() { // reject NULL up front; the pointer is never dereferenced on this path
278
- set_last_error("Config cannot be NULL".to_string());
279
- return ptr::null_mut();
280
- }
281
- // The field name is checked separately so each NULL argument gets its own message.
282
- if field_name.is_null() {
283
- set_last_error("Field name cannot be NULL".to_string());
284
- return ptr::null_mut();
285
- }
286
- // clear_last_error() runs only after both NULL checks, so the messages set above are not wiped.
287
- clear_last_error();
288
- // Borrow the caller's bytes as &str; a non-UTF-8 path is reported as an error.
289
- let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() { // SAFETY: non-null checked above; caller guarantees NUL termination (see # Safety)
290
- Ok(s) => s,
291
- Err(e) => {
292
- set_last_error(format!("Invalid UTF-8 in field name: {}", e));
293
- return ptr::null_mut();
294
- }
295
- };
296
- // Serialize the whole config into a serde_json value tree so fields can be looked up by name.
297
- let json_value = match serde_json::to_value(unsafe { &*config }) { // SAFETY: non-null checked above; caller guarantees it points to a live ExtractionConfig
298
- Ok(val) => val,
299
- Err(e) => {
300
- set_last_error(format!("Failed to serialize config: {}", e));
301
- return ptr::null_mut();
302
- }
303
- };
304
- // Walk the dot-separated path one segment at a time, descending only through JSON objects.
305
- let mut current = &json_value;
306
- for part in field_str.split('.') {
307
- if let Some(obj) = current.as_object() {
308
- match obj.get(part) {
309
- Some(val) => current = val,
310
- None => {
311
- set_last_error(format!("Field '{}' not found in config", field_str)); // reports the full requested path, not just the missing segment
312
- return ptr::null_mut();
313
- }
314
- }
315
- } else {
316
- set_last_error(format!("Cannot access nested field '{}' in non-object", part)); // reports the segment that was looked up inside a scalar or array
317
- return ptr::null_mut();
318
- }
319
- }
320
- // Re-serialize only the selected subtree; a JSON null value comes back as the text "null", not as a NULL pointer.
321
- match serde_json::to_string(current) {
322
- Ok(json) => match std::ffi::CString::new(json) { // CString::new fails if the text contains an interior NUL byte
323
- Ok(c_string) => c_string.into_raw(), // ownership passes to the caller; released via kreuzberg_free_string
324
- Err(e) => {
325
- set_last_error(format!("Failed to convert field value to C string: {}", e));
326
- ptr::null_mut()
327
- }
328
- },
329
- Err(e) => {
330
- set_last_error(format!("Failed to serialize field value: {}", e));
331
- ptr::null_mut()
332
- }
333
- }
334
- }
335
-
336
- /// Merge two configs (override takes precedence over base).
337
- ///
338
- /// Performs a shallow merge of two ExtractionConfig structures. Plain fields such as
339
- /// `use_cache` and `force_ocr` are always copied from `override_config`; optional sections
340
- /// such as `ocr` replace the one in `base` only when set. The `base` config is modified in-place.
341
- ///
342
- /// # Arguments
343
- ///
344
- /// * `base` - Pointer to the base ExtractionConfig (will be modified)
345
- /// * `override_config` - Pointer to the override ExtractionConfig (read-only)
346
- ///
347
- /// # Returns
348
- ///
349
- /// - 1 on success
350
- /// - 0 on error (check `kreuzberg_last_error`)
351
- ///
352
- /// # Safety
353
- ///
354
- /// - `base` must be a valid mutable pointer to an ExtractionConfig
355
- /// - `override_config` must be a valid pointer to an ExtractionConfig
356
- /// - Neither parameter can be NULL
357
- /// - `base` is modified in-place
358
- ///
359
- /// # Example (C)
360
- ///
361
- /// ```c
362
- /// ExtractionConfig* base = kreuzberg_config_from_json(
363
- /// "{\"use_cache\": true, \"force_ocr\": false}"
364
- /// );
365
- /// ExtractionConfig* override = kreuzberg_config_from_json(
366
- /// "{\"force_ocr\": true}"
367
- /// );
368
- ///
369
- /// if (kreuzberg_config_merge(base, override) == 1) {
370
- /// // base now has: use_cache=true, force_ocr=true
371
- /// char* json = kreuzberg_config_to_json(base);
372
- /// printf("Merged config: %s\n", json);
373
- /// kreuzberg_free_string(json);
374
- /// }
375
- ///
376
- /// kreuzberg_config_free(base);
377
- /// kreuzberg_config_free(override);
378
- /// ```
379
- #[unsafe(no_mangle)]
380
- pub unsafe extern "C" fn kreuzberg_config_merge(
381
- base: *mut ExtractionConfig,
382
- override_config: *const ExtractionConfig,
383
- ) -> i32 {
384
- if base.is_null() {
385
- set_last_error("Base config cannot be NULL".to_string());
386
- return 0;
387
- }
388
-
389
- if override_config.is_null() {
390
- set_last_error("Override config cannot be NULL".to_string());
391
- return 0;
392
- }
393
-
394
- clear_last_error();
395
-
396
- let base_ref = unsafe { &mut *base };
397
- let override_ref = unsafe { &*override_config };
398
-
399
- base_ref.use_cache = override_ref.use_cache;
400
- base_ref.enable_quality_processing = override_ref.enable_quality_processing;
401
- base_ref.force_ocr = override_ref.force_ocr;
402
- base_ref.max_concurrent_extractions = override_ref.max_concurrent_extractions;
403
-
404
- if override_ref.ocr.is_some() {
405
- base_ref.ocr = override_ref.ocr.clone();
406
- }
407
-
408
- if override_ref.chunking.is_some() {
409
- base_ref.chunking = override_ref.chunking.clone();
410
- }
411
-
412
- if override_ref.images.is_some() {
413
- base_ref.images = override_ref.images.clone();
414
- }
415
-
416
- #[cfg(feature = "pdf")]
417
- if override_ref.pdf_options.is_some() {
418
- base_ref.pdf_options = override_ref.pdf_options.clone();
419
- }
420
-
421
- if override_ref.token_reduction.is_some() {
422
- base_ref.token_reduction = override_ref.token_reduction.clone();
423
- }
424
-
425
- if override_ref.language_detection.is_some() {
426
- base_ref.language_detection = override_ref.language_detection.clone();
427
- }
428
-
429
- if override_ref.pages.is_some() {
430
- base_ref.pages = override_ref.pages.clone();
431
- }
432
-
433
- #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
434
- if override_ref.keywords.is_some() {
435
- base_ref.keywords = override_ref.keywords.clone();
436
- }
437
-
438
- if override_ref.postprocessor.is_some() {
439
- base_ref.postprocessor = override_ref.postprocessor.clone();
440
- }
441
-
442
- if override_ref.html_options.is_some() {
443
- base_ref.html_options = override_ref.html_options.clone();
444
- }
445
-
446
- 1
447
- }
448
-
449
- /// Parse ExtractionConfig from JSON string.
450
- ///
451
- /// This is the core parsing logic shared by all FFI functions that deal with
452
- /// JSON configuration. It handles:
453
- /// - JSON deserialization
454
- /// - All validation rules
455
- /// - Type conversions
456
- /// - HTML options parsing (complex nested structure)
457
- ///
458
- /// The error messages are user-friendly and include guidance on what went wrong.
459
- fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
460
- use html_to_markdown_rs::options::{
461
- CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
462
- PreprocessingPreset, WhitespaceMode,
463
- };
464
-
465
- // ~keep: This function performs the JSON parsing and validation that was
466
-
467
- fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
468
- where
469
- F: Fn(&str) -> FfiResult<T>,
470
- {
471
- if let Some(raw) = value {
472
- let text = raw
473
- .as_str()
474
- .ok_or_else(|| "Expected string for enum field".to_string())?;
475
- return parse_fn(text).map(Some);
476
- }
477
- Ok(None)
478
- }
479
-
480
- fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
481
- match value.to_lowercase().as_str() {
482
- "atx" => Ok(HeadingStyle::Atx),
483
- "underlined" => Ok(HeadingStyle::Underlined),
484
- "atx_closed" => Ok(HeadingStyle::AtxClosed),
485
- other => Err(format!(
486
- "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
487
- other
488
- )),
489
- }
490
- }
491
-
492
- fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
493
- match value.to_lowercase().as_str() {
494
- "spaces" => Ok(ListIndentType::Spaces),
495
- "tabs" => Ok(ListIndentType::Tabs),
496
- other => Err(format!(
497
- "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
498
- other
499
- )),
500
- }
501
- }
502
-
503
- fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
504
- match value.to_lowercase().as_str() {
505
- "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
506
- "html" => Ok(HighlightStyle::Html),
507
- "bold" => Ok(HighlightStyle::Bold),
508
- "none" => Ok(HighlightStyle::None),
509
- other => Err(format!(
510
- "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
511
- other
512
- )),
513
- }
514
- }
515
-
516
- fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
517
- match value.to_lowercase().as_str() {
518
- "normalized" => Ok(WhitespaceMode::Normalized),
519
- "strict" => Ok(WhitespaceMode::Strict),
520
- other => Err(format!(
521
- "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
522
- other
523
- )),
524
- }
525
- }
526
-
527
- fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
528
- match value.to_lowercase().as_str() {
529
- "spaces" => Ok(NewlineStyle::Spaces),
530
- "backslash" => Ok(NewlineStyle::Backslash),
531
- other => Err(format!(
532
- "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
533
- other
534
- )),
535
- }
536
- }
537
-
538
- fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
539
- match value.to_lowercase().as_str() {
540
- "indented" => Ok(CodeBlockStyle::Indented),
541
- "backticks" => Ok(CodeBlockStyle::Backticks),
542
- "tildes" => Ok(CodeBlockStyle::Tildes),
543
- other => Err(format!(
544
- "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
545
- other
546
- )),
547
- }
548
- }
549
-
550
- #[allow(dead_code)]
551
- fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
552
- match value.to_lowercase().as_str() {
553
- "minimal" => Ok(PreprocessingPreset::Minimal),
554
- "standard" => Ok(PreprocessingPreset::Standard),
555
- "aggressive" => Ok(PreprocessingPreset::Aggressive),
556
- other => Err(format!(
557
- "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
558
- other
559
- )),
560
- }
561
- }
562
-
563
- fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
564
- let mut opts = ConversionOptions::default();
565
- let obj = value
566
- .as_object()
567
- .ok_or_else(|| "html_options must be an object".to_string())?;
568
-
569
- if let Some(val) = obj.get("heading_style") {
570
- opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
571
- }
572
-
573
- if let Some(val) = obj.get("list_indent_type") {
574
- opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
575
- }
576
-
577
- if let Some(val) = obj.get("list_indent_width") {
578
- opts.list_indent_width = val
579
- .as_u64()
580
- .map(|v| v as usize)
581
- .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
582
- }
583
-
584
- if let Some(val) = obj.get("bullets") {
585
- opts.bullets = val
586
- .as_str()
587
- .map(str::to_string)
588
- .ok_or_else(|| "bullets must be a string".to_string())?;
589
- }
590
-
591
- if let Some(val) = obj.get("strong_em_symbol") {
592
- let symbol = val
593
- .as_str()
594
- .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
595
- let mut chars = symbol.chars();
596
- opts.strong_em_symbol = chars
597
- .next()
598
- .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
599
- }
600
-
601
- if let Some(val) = obj.get("escape_asterisks") {
602
- opts.escape_asterisks = val
603
- .as_bool()
604
- .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
605
- }
606
-
607
- if let Some(val) = obj.get("escape_underscores") {
608
- opts.escape_underscores = val
609
- .as_bool()
610
- .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
611
- }
612
-
613
- if let Some(val) = obj.get("escape_misc") {
614
- opts.escape_misc = val
615
- .as_bool()
616
- .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
617
- }
618
-
619
- if let Some(val) = obj.get("escape_ascii") {
620
- opts.escape_ascii = val
621
- .as_bool()
622
- .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
623
- }
624
-
625
- if let Some(val) = obj.get("code_language") {
626
- opts.code_language = val
627
- .as_str()
628
- .map(str::to_string)
629
- .ok_or_else(|| "code_language must be a string".to_string())?;
630
- }
631
-
632
- if let Some(val) = obj.get("autolinks") {
633
- opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
634
- }
635
-
636
- if let Some(val) = obj.get("default_title") {
637
- opts.default_title = val
638
- .as_bool()
639
- .ok_or_else(|| "default_title must be a boolean".to_string())?;
640
- }
641
-
642
- if let Some(val) = obj.get("br_in_tables") {
643
- opts.br_in_tables = val
644
- .as_bool()
645
- .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
646
- }
647
-
648
- if let Some(val) = obj.get("hocr_spatial_tables") {
649
- opts.hocr_spatial_tables = val
650
- .as_bool()
651
- .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
652
- }
653
-
654
- if let Some(val) = obj.get("highlight_style") {
655
- opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
656
- }
657
-
658
- if let Some(val) = obj.get("extract_metadata") {
659
- opts.extract_metadata = val
660
- .as_bool()
661
- .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
662
- }
663
-
664
- if let Some(val) = obj.get("whitespace_mode") {
665
- opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
666
- }
667
-
668
- if let Some(val) = obj.get("strip_newlines") {
669
- opts.strip_newlines = val
670
- .as_bool()
671
- .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
672
- }
673
-
674
- if let Some(val) = obj.get("wrap") {
675
- opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
676
- }
677
-
678
- if let Some(val) = obj.get("wrap_width") {
679
- opts.wrap_width = val
680
- .as_u64()
681
- .map(|v| v as usize)
682
- .ok_or_else(|| "wrap_width must be an integer".to_string())?;
683
- }
684
-
685
- if let Some(val) = obj.get("convert_as_inline") {
686
- opts.convert_as_inline = val
687
- .as_bool()
688
- .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
689
- }
690
-
691
- if let Some(val) = obj.get("sub_symbol") {
692
- opts.sub_symbol = val
693
- .as_str()
694
- .map(str::to_string)
695
- .ok_or_else(|| "sub_symbol must be a string".to_string())?;
696
- }
697
-
698
- if let Some(val) = obj.get("sup_symbol") {
699
- opts.sup_symbol = val
700
- .as_str()
701
- .map(str::to_string)
702
- .ok_or_else(|| "sup_symbol must be a string".to_string())?;
703
- }
704
-
705
- if let Some(val) = obj.get("newline_style") {
706
- opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
707
- }
708
-
709
- if let Some(val) = obj.get("code_block_style") {
710
- opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
711
- }
712
-
713
- if let Some(val) = obj.get("keep_inline_images_in") {
714
- opts.keep_inline_images_in = val
715
- .as_array()
716
- .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
717
- .iter()
718
- .map(|v| {
719
- v.as_str()
720
- .map(str::to_string)
721
- .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
722
- })
723
- .collect::<FfiResult<Vec<_>>>()?;
724
- }
725
-
726
- if let Some(val) = obj.get("encoding") {
727
- opts.encoding = val
728
- .as_str()
729
- .map(str::to_string)
730
- .ok_or_else(|| "encoding must be a string".to_string())?;
731
- }
732
-
733
- if let Some(val) = obj.get("debug") {
734
- opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
735
- }
736
-
737
- if let Some(val) = obj.get("strip_tags") {
738
- opts.strip_tags = val
739
- .as_array()
740
- .ok_or_else(|| "strip_tags must be an array".to_string())?
741
- .iter()
742
- .map(|v| {
743
- v.as_str()
744
- .map(str::to_string)
745
- .ok_or_else(|| "strip_tags entries must be strings".to_string())
746
- })
747
- .collect::<FfiResult<Vec<_>>>()?;
748
- }
749
-
750
- if let Some(val) = obj.get("preserve_tags") {
751
- opts.preserve_tags = val
752
- .as_array()
753
- .ok_or_else(|| "preserve_tags must be an array".to_string())?
754
- .iter()
755
- .map(|v| {
756
- v.as_str()
757
- .map(str::to_string)
758
- .ok_or_else(|| "preserve_tags entries must be strings".to_string())
759
- })
760
- .collect::<FfiResult<Vec<_>>>()?;
761
- }
762
-
763
- Ok(opts)
764
- }
765
-
766
- let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
767
-
768
- let mut config: ExtractionConfig =
769
- serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
770
-
771
- if let Some(html_opts_val) = json_value.get("html_options") {
772
- config.html_options = Some(parse_html_options(html_opts_val)?);
773
- }
774
-
775
- Ok(config)
776
- }
777
-
778
- /// SerializableEmbeddingPreset for FFI serialization.
779
- #[derive(Serialize)]
780
- struct SerializableEmbeddingPreset<'a> {
781
- name: &'a str,
782
- chunk_size: usize,
783
- overlap: usize,
784
- model_name: String,
785
- dimensions: usize,
786
- description: &'a str,
787
- }
788
-
789
- /// Load an ExtractionConfig from a file.
790
- ///
791
- /// Returns a JSON string representing the loaded configuration.
792
- ///
793
- /// # Safety
794
- ///
795
- /// - `file_path` must be a valid null-terminated C string
796
- /// - The returned string must be freed with `kreuzberg_free_string`
797
- /// - Returns NULL on error (check `kreuzberg_last_error`)
798
- #[unsafe(no_mangle)]
799
- pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
800
- ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
801
- clear_last_error();
802
-
803
- if file_path.is_null() {
804
- set_last_error("file_path cannot be NULL".to_string());
805
- return ptr::null_mut();
806
- }
807
-
808
- let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
809
- Ok(s) => s,
810
- Err(e) => {
811
- set_last_error(format!("Invalid UTF-8 in file path: {}", e));
812
- return ptr::null_mut();
813
- }
814
- };
815
-
816
- match ExtractionConfig::from_file(path_str) {
817
- Ok(config) => match serde_json::to_string(&config) {
818
- Ok(json) => match CString::new(json) {
819
- Ok(cstr) => cstr.into_raw(),
820
- Err(e) => {
821
- set_last_error(format!("Failed to create C string: {}", e));
822
- ptr::null_mut()
823
- }
824
- },
825
- Err(e) => {
826
- set_last_error(format!("Failed to serialize config to JSON: {}", e));
827
- ptr::null_mut()
828
- }
829
- },
830
- Err(e) => {
831
- set_last_error(e.to_string());
832
- ptr::null_mut()
833
- }
834
- }
835
- })
836
- }
837
-
838
- /// Load an ExtractionConfig from a file (returns pointer to config struct).
839
- ///
840
- /// # Safety
841
- ///
842
- /// - `path` must be a valid null-terminated C string
843
- /// - The returned pointer must be freed with `kreuzberg_config_free`
844
- /// - Returns NULL on error (check `kreuzberg_last_error`)
845
- ///
846
- /// # Example (C)
847
- ///
848
- /// ```c
849
- /// ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
850
- /// if (config == NULL) {
851
- /// printf("Error: %s\n", kreuzberg_last_error());
852
- /// return 1;
853
- /// }
854
- /// kreuzberg_config_free(config);
855
- /// ```
856
- #[unsafe(no_mangle)]
857
- pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
858
- ffi_panic_guard!("kreuzberg_config_from_file", {
859
- clear_last_error();
860
-
861
- if path.is_null() {
862
- set_last_error("Config path cannot be NULL".to_string());
863
- return ptr::null_mut();
864
- }
865
-
866
- let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
867
- Ok(s) => s,
868
- Err(e) => {
869
- set_last_error(format!("Invalid UTF-8 in config path: {}", e));
870
- return ptr::null_mut();
871
- }
872
- };
873
-
874
- let path_buf = Path::new(path_str);
875
-
876
- match ExtractionConfig::from_file(path_buf) {
877
- Ok(config) => Box::into_raw(Box::new(config)),
878
- Err(e) => {
879
- match &e {
880
- KreuzbergError::Io(io_err) => {
881
- set_last_error(format!("IO error loading config: {}", io_err));
882
- }
883
- _ => {
884
- set_last_error(format!("Failed to load config from file: {}", e));
885
- }
886
- }
887
- ptr::null_mut()
888
- }
889
- }
890
- })
891
- }
892
-
893
- /// Discover and load an ExtractionConfig by searching parent directories.
894
- ///
895
- /// Searches the current directory and all parent directories for:
896
- /// - `kreuzberg.toml`
897
- /// - `kreuzberg.json`
898
- ///
899
- /// Returns the first config file found as a JSON string.
900
- ///
901
- /// # Safety
902
- ///
903
- /// - The returned string must be freed with `kreuzberg_free_string`
904
- /// - Returns NULL if no config is found or on error
905
- ///
906
- /// # Example (C)
907
- ///
908
- /// ```c
909
- /// char* config_json = kreuzberg_config_discover();
910
- /// if (config_json != NULL) {
911
- /// printf("Discovered config: %s\n", config_json);
912
- /// kreuzberg_free_string(config_json);
913
- /// }
914
- /// ```
915
- #[unsafe(no_mangle)]
916
- pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
917
- ffi_panic_guard!("kreuzberg_config_discover", {
918
- clear_last_error();
919
-
920
- match ExtractionConfig::discover() {
921
- Ok(Some(config)) => match serde_json::to_string(&config) {
922
- Ok(json) => match CString::new(json) {
923
- Ok(cstr) => cstr.into_raw(),
924
- Err(e) => {
925
- set_last_error(format!("Failed to serialize config: {}", e));
926
- ptr::null_mut()
927
- }
928
- },
929
- Err(e) => {
930
- set_last_error(format!("Failed to serialize config: {}", e));
931
- ptr::null_mut()
932
- }
933
- },
934
- Ok(None) => ptr::null_mut(),
935
- Err(e) => {
936
- match &e {
937
- KreuzbergError::Io(io_err) => {
938
- set_last_error(format!("IO error discovering config: {}", io_err));
939
- }
940
- _ => {
941
- set_last_error(format!("Failed to discover config: {}", e));
942
- }
943
- }
944
- ptr::null_mut()
945
- }
946
- }
947
- })
948
- }
949
-
950
- /// List available embedding preset names.
951
- ///
952
- /// # Safety
953
- ///
954
- /// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
955
- /// - Returns NULL on error (check `kreuzberg_last_error`)
956
- #[unsafe(no_mangle)]
957
- pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
958
- ffi_panic_guard!("kreuzberg_list_embedding_presets", {
959
- clear_last_error();
960
-
961
- let presets = kreuzberg::embeddings::list_presets();
962
- match serde_json::to_string(&presets) {
963
- Ok(json) => match string_to_c_string(json) {
964
- Ok(ptr) => ptr,
965
- Err(e) => {
966
- set_last_error(e);
967
- ptr::null_mut()
968
- }
969
- },
970
- Err(e) => {
971
- set_last_error(format!("Failed to serialize presets: {}", e));
972
- ptr::null_mut()
973
- }
974
- }
975
- })
976
- }
977
-
978
- /// Get a specific embedding preset by name.
979
- ///
980
- /// # Safety
981
- ///
982
- /// - `name` must be a valid null-terminated C string
983
- /// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
984
- /// - Returns NULL on error (check `kreuzberg_last_error`)
985
- #[unsafe(no_mangle)]
986
- pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
987
- ffi_panic_guard!("kreuzberg_get_embedding_preset", {
988
- clear_last_error();
989
-
990
- if name.is_null() {
991
- set_last_error("preset name cannot be NULL".to_string());
992
- return ptr::null_mut();
993
- }
994
-
995
- let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
996
- Ok(s) => s,
997
- Err(e) => {
998
- set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
999
- return ptr::null_mut();
1000
- }
1001
- };
1002
-
1003
- let preset = match kreuzberg::embeddings::get_preset(preset_name) {
1004
- Some(preset) => preset,
1005
- None => {
1006
- set_last_error(format!("Unknown embedding preset: {}", preset_name));
1007
- return ptr::null_mut();
1008
- }
1009
- };
1010
-
1011
- let model_name = format!("{:?}", preset.model);
1012
- let serializable = SerializableEmbeddingPreset {
1013
- name: preset.name,
1014
- chunk_size: preset.chunk_size,
1015
- overlap: preset.overlap,
1016
- model_name,
1017
- dimensions: preset.dimensions,
1018
- description: preset.description,
1019
- };
1020
-
1021
- match serde_json::to_string(&serializable) {
1022
- Ok(json) => match string_to_c_string(json) {
1023
- Ok(ptr) => ptr,
1024
- Err(e) => {
1025
- set_last_error(e);
1026
- ptr::null_mut()
1027
- }
1028
- },
1029
- Err(e) => {
1030
- set_last_error(format!("Failed to serialize embedding preset: {}", e));
1031
- ptr::null_mut()
1032
- }
1033
- }
1034
- })
1035
- }
1036
-
1037
- #[cfg(test)]
1038
- mod tests {
1039
- use super::*;
1040
- use std::ffi::CStr;
1041
-
1042
- #[test]
1043
- fn test_parse_minimal_config() {
1044
- let json = "{}";
1045
- let result = parse_extraction_config_from_json(json);
1046
- assert!(result.is_ok());
1047
- }
1048
-
1049
- #[test]
1050
- fn test_parse_config_with_use_cache() {
1051
- let json = r#"{"use_cache": true}"#;
1052
- let result = parse_extraction_config_from_json(json);
1053
- assert!(result.is_ok());
1054
- let config = result.unwrap();
1055
- assert!(config.use_cache);
1056
- }
1057
-
1058
- #[test]
1059
- fn test_parse_config_with_ocr() {
1060
- let json = r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#;
1061
- let result = parse_extraction_config_from_json(json);
1062
- assert!(result.is_ok());
1063
- let config = result.unwrap();
1064
- assert!(config.ocr.is_some());
1065
- let ocr = config.ocr.unwrap();
1066
- assert_eq!(ocr.backend, "tesseract");
1067
- assert_eq!(ocr.language, "eng");
1068
- }
1069
-
1070
- #[test]
1071
- fn test_parse_invalid_json() {
1072
- let json = "{invalid json}";
1073
- let result = parse_extraction_config_from_json(json);
1074
- assert!(result.is_err());
1075
- }
1076
-
1077
- #[test]
1078
- fn test_parse_complex_config() {
1079
- let json = r#"{
1080
- "use_cache": true,
1081
- "enable_quality_processing": true,
1082
- "force_ocr": false,
1083
- "ocr": {
1084
- "backend": "tesseract",
1085
- "language": "eng"
1086
- },
1087
- "chunking": {
1088
- "max_chars": 1024,
1089
- "max_overlap": 128
1090
- },
1091
- "max_concurrent_extractions": 4
1092
- }"#;
1093
- let result = parse_extraction_config_from_json(json);
1094
- assert!(result.is_ok());
1095
- }
1096
-
1097
- #[test]
1098
- fn test_config_to_json() {
1099
- let json_str = r#"{"use_cache": true}"#;
1100
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1101
- assert!(!config_ptr.is_null());
1102
-
1103
- let json_out = unsafe { kreuzberg_config_to_json(config_ptr) };
1104
- assert!(!json_out.is_null());
1105
-
1106
- let out_str = unsafe { CStr::from_ptr(json_out).to_str().unwrap() };
1107
- assert!(out_str.contains("use_cache"));
1108
- assert!(out_str.contains("true"));
1109
-
1110
- unsafe {
1111
- crate::kreuzberg_free_string(json_out);
1112
- kreuzberg_config_free(config_ptr);
1113
- }
1114
- }
1115
-
1116
- #[test]
1117
- fn test_config_to_json_null_pointer() {
1118
- let result = unsafe { kreuzberg_config_to_json(ptr::null()) };
1119
- assert!(result.is_null());
1120
- }
1121
-
1122
- #[test]
1123
- fn test_config_get_field_simple() {
1124
- let json_str = r#"{"use_cache": true}"#;
1125
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1126
- assert!(!config_ptr.is_null());
1127
-
1128
- let field_name = std::ffi::CString::new("use_cache").unwrap();
1129
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1130
- assert!(!field_value.is_null());
1131
-
1132
- let value_str = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
1133
- assert_eq!(value_str, "true");
1134
-
1135
- unsafe {
1136
- crate::kreuzberg_free_string(field_value);
1137
- kreuzberg_config_free(config_ptr);
1138
- }
1139
- }
1140
-
1141
- #[test]
1142
- fn test_config_get_field_nested() {
1143
- let json_str = r#"{"ocr": {"backend": "tesseract"}}"#;
1144
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1145
- assert!(!config_ptr.is_null());
1146
-
1147
- let field_name = std::ffi::CString::new("ocr.backend").unwrap();
1148
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1149
- assert!(!field_value.is_null());
1150
-
1151
- let value_str = unsafe { CStr::from_ptr(field_value).to_str().unwrap() };
1152
- assert_eq!(value_str, r#""tesseract""#);
1153
-
1154
- unsafe {
1155
- crate::kreuzberg_free_string(field_value);
1156
- kreuzberg_config_free(config_ptr);
1157
- }
1158
- }
1159
-
1160
- #[test]
1161
- fn test_config_get_field_missing() {
1162
- let json_str = r#"{"use_cache": true}"#;
1163
- let config_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(json_str).unwrap().as_ptr()) };
1164
- assert!(!config_ptr.is_null());
1165
-
1166
- let field_name = std::ffi::CString::new("nonexistent").unwrap();
1167
- let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
1168
- assert!(field_value.is_null());
1169
-
1170
- unsafe {
1171
- kreuzberg_config_free(config_ptr);
1172
- }
1173
- }
1174
-
1175
- #[test]
1176
- fn test_config_get_field_null_pointer() {
1177
- let field_name = std::ffi::CString::new("use_cache").unwrap();
1178
- let result = unsafe { kreuzberg_config_get_field(ptr::null(), field_name.as_ptr()) };
1179
- assert!(result.is_null());
1180
- }
1181
-
1182
- #[test]
1183
- fn test_config_merge() {
1184
- let base_json = r#"{"use_cache": true, "force_ocr": false}"#;
1185
- let override_json = r#"{"force_ocr": true}"#;
1186
-
1187
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1188
- let override_ptr =
1189
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1190
-
1191
- assert!(!base_ptr.is_null());
1192
- assert!(!override_ptr.is_null());
1193
-
1194
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1195
- assert_eq!(result, 1);
1196
-
1197
- let merged_json = unsafe { kreuzberg_config_to_json(base_ptr) };
1198
- assert!(!merged_json.is_null());
1199
-
1200
- let merged_str = unsafe { CStr::from_ptr(merged_json).to_str().unwrap() };
1201
- assert!(merged_str.contains("use_cache"));
1202
- assert!(merged_str.contains("force_ocr"));
1203
-
1204
- unsafe {
1205
- crate::kreuzberg_free_string(merged_json);
1206
- kreuzberg_config_free(base_ptr);
1207
- kreuzberg_config_free(override_ptr);
1208
- }
1209
- }
1210
-
1211
- #[test]
1212
- fn test_config_merge_null_base() {
1213
- let override_json = r#"{"force_ocr": true}"#;
1214
- let override_ptr =
1215
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1216
-
1217
- let result = unsafe { kreuzberg_config_merge(ptr::null_mut(), override_ptr) };
1218
- assert_eq!(result, 0);
1219
-
1220
- unsafe {
1221
- kreuzberg_config_free(override_ptr);
1222
- }
1223
- }
1224
-
1225
- #[test]
1226
- fn test_config_merge_null_override() {
1227
- let base_json = r#"{"use_cache": true}"#;
1228
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1229
-
1230
- let result = unsafe { kreuzberg_config_merge(base_ptr, ptr::null()) };
1231
- assert_eq!(result, 0);
1232
-
1233
- unsafe {
1234
- kreuzberg_config_free(base_ptr);
1235
- }
1236
- }
1237
-
1238
- #[test]
1239
- fn test_config_merge_override_to_default_value() {
1240
- let base_json = r#"{"use_cache": false}"#;
1241
- let override_json = r#"{"use_cache": true}"#;
1242
-
1243
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1244
- let override_ptr =
1245
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1246
-
1247
- assert!(!base_ptr.is_null());
1248
- assert!(!override_ptr.is_null());
1249
-
1250
- let base_ref = unsafe { &*base_ptr };
1251
- assert!(!base_ref.use_cache);
1252
-
1253
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1254
- assert_eq!(result, 1);
1255
-
1256
- let base_ref = unsafe { &*base_ptr };
1257
- assert!(base_ref.use_cache, "override to default value should be applied");
1258
-
1259
- unsafe {
1260
- kreuzberg_config_free(base_ptr);
1261
- kreuzberg_config_free(override_ptr);
1262
- }
1263
- }
1264
-
1265
- #[test]
1266
- fn test_config_merge_override_force_ocr() {
1267
- let base_json = r#"{"force_ocr": false}"#;
1268
- let override_json = r#"{"force_ocr": true}"#;
1269
-
1270
- let base_ptr = unsafe { kreuzberg_config_from_json(std::ffi::CString::new(base_json).unwrap().as_ptr()) };
1271
- let override_ptr =
1272
- unsafe { kreuzberg_config_from_json(std::ffi::CString::new(override_json).unwrap().as_ptr()) };
1273
-
1274
- assert!(!base_ptr.is_null());
1275
- assert!(!override_ptr.is_null());
1276
-
1277
- let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
1278
- assert_eq!(result, 1);
1279
-
1280
- let base_ref = unsafe { &*base_ptr };
1281
- assert!(base_ref.force_ocr);
1282
-
1283
- unsafe {
1284
- kreuzberg_config_free(base_ptr);
1285
- kreuzberg_config_free(override_ptr);
1286
- }
1287
- }
1288
-
1289
- #[test]
1290
- fn test_list_embedding_presets() {
1291
- let result = unsafe { kreuzberg_list_embedding_presets() };
1292
- assert!(!result.is_null());
1293
-
1294
- let presets_str = unsafe { CStr::from_ptr(result).to_str().unwrap() };
1295
- assert!(presets_str.starts_with('['));
1296
- assert!(presets_str.ends_with(']'));
1297
-
1298
- unsafe {
1299
- crate::kreuzberg_free_string(result);
1300
- }
1301
- }
1302
-
1303
- #[test]
1304
- fn test_get_embedding_preset_null() {
1305
- let result = unsafe { kreuzberg_get_embedding_preset(ptr::null()) };
1306
- assert!(result.is_null());
1307
- }
1308
-
1309
- #[test]
1310
- fn test_get_embedding_preset_unknown() {
1311
- let name = CString::new("nonexistent_preset").unwrap();
1312
- let result = unsafe { kreuzberg_get_embedding_preset(name.as_ptr()) };
1313
- assert!(result.is_null());
1314
- }
1315
-
1316
- #[test]
1317
- fn test_get_embedding_preset_valid() {
1318
- let name = CString::new("fast").unwrap();
1319
- let result = unsafe { kreuzberg_get_embedding_preset(name.as_ptr()) };
1320
- assert!(!result.is_null());
1321
-
1322
- let preset_str = unsafe { CStr::from_ptr(result).to_str().unwrap() };
1323
- assert!(preset_str.contains("name"));
1324
- assert!(preset_str.contains("chunk_size"));
1325
-
1326
- unsafe {
1327
- crate::kreuzberg_free_string(result);
1328
- }
1329
- }
1330
-
1331
- #[test]
1332
- fn test_config_discover_null_safe() {
1333
- let result = unsafe { kreuzberg_config_discover() };
1334
- // Result can be null if no config found, which is valid
1335
- if !result.is_null() {
1336
- unsafe {
1337
- crate::kreuzberg_free_string(result);
1338
- }
1339
- }
1340
- }
1341
- }