kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,598 @@
1
+ //! Builder pattern API for constructing ExtractionConfig programmatically.
2
+ //!
3
+ //! This module provides a step-by-step builder interface for language bindings
4
+ //! that prefer to construct configurations programmatically rather than via JSON.
5
+ //!
6
+ //! Unlike the JSON-based API in config.rs, this builder allows incremental
7
+ //! configuration construction with immediate validation at each step.
8
+
9
+ use crate::ffi_panic_guard;
10
+ use crate::ffi_panic_guard_i32;
11
+ use crate::helpers::{clear_last_error, set_last_error};
12
+ use kreuzberg::core::config::{
13
+ ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
14
+ PostProcessorConfig,
15
+ };
16
+ use std::ffi::{CStr, c_char};
17
+ use std::ptr;
18
+
19
+ /// Opaque builder struct for constructing ExtractionConfig.
20
+ ///
21
+ /// Use kreuzberg_config_builder_new() to create, set fields with setters,
22
+ /// then finalize with kreuzberg_config_builder_build().
23
+ pub struct ConfigBuilder {
24
+ config: ExtractionConfig,
25
+ }
26
+
27
+ impl ConfigBuilder {
28
+ fn new() -> Self {
29
+ Self {
30
+ config: ExtractionConfig::default(),
31
+ }
32
+ }
33
+
34
+ fn set_use_cache(&mut self, use_cache: bool) {
35
+ self.config.use_cache = use_cache;
36
+ }
37
+
38
+ fn set_ocr_from_json(&mut self, ocr_json: &str) -> Result<(), String> {
39
+ let ocr_config: OcrConfig =
40
+ serde_json::from_str(ocr_json).map_err(|e| format!("Failed to parse OCR config JSON: {}", e))?;
41
+ self.config.ocr = Some(ocr_config);
42
+ Ok(())
43
+ }
44
+
45
+ fn set_pdf_from_json(&mut self, pdf_json: &str) -> Result<(), String> {
46
+ let pdf_config: PdfConfig =
47
+ serde_json::from_str(pdf_json).map_err(|e| format!("Failed to parse PDF config JSON: {}", e))?;
48
+ self.config.pdf_options = Some(pdf_config);
49
+ Ok(())
50
+ }
51
+
52
+ fn set_chunking_from_json(&mut self, chunking_json: &str) -> Result<(), String> {
53
+ let chunking_config: ChunkingConfig =
54
+ serde_json::from_str(chunking_json).map_err(|e| format!("Failed to parse chunking config JSON: {}", e))?;
55
+ self.config.chunking = Some(chunking_config);
56
+ Ok(())
57
+ }
58
+
59
+ fn set_image_extraction_from_json(&mut self, image_json: &str) -> Result<(), String> {
60
+ let image_config: ImageExtractionConfig = serde_json::from_str(image_json)
61
+ .map_err(|e| format!("Failed to parse image extraction config JSON: {}", e))?;
62
+ self.config.images = Some(image_config);
63
+ Ok(())
64
+ }
65
+
66
+ fn set_post_processor_from_json(&mut self, pp_json: &str) -> Result<(), String> {
67
+ let pp_config: PostProcessorConfig =
68
+ serde_json::from_str(pp_json).map_err(|e| format!("Failed to parse post processor config JSON: {}", e))?;
69
+ self.config.postprocessor = Some(pp_config);
70
+ Ok(())
71
+ }
72
+
73
+ fn set_language_detection_from_json(&mut self, ld_json: &str) -> Result<(), String> {
74
+ let ld_config: LanguageDetectionConfig = serde_json::from_str(ld_json)
75
+ .map_err(|e| format!("Failed to parse language detection config JSON: {}", e))?;
76
+ self.config.language_detection = Some(ld_config);
77
+ Ok(())
78
+ }
79
+
80
+ fn build(self) -> ExtractionConfig {
81
+ self.config
82
+ }
83
+ }
84
+
85
+ /// Create a new config builder.
86
+ ///
87
+ /// Returns an opaque pointer to ConfigBuilder. Must be freed with
88
+ /// kreuzberg_config_builder_free() or consumed by kreuzberg_config_builder_build().
89
+ ///
90
+ /// # Safety
91
+ ///
92
+ /// The returned pointer must be freed with kreuzberg_config_builder_free()
93
+ /// or passed to kreuzberg_config_builder_build().
94
+ ///
95
+ /// # Example (C)
96
+ ///
97
+ /// ```c
98
+ /// ConfigBuilder* builder = kreuzberg_config_builder_new();
99
+ /// kreuzberg_config_builder_set_use_cache(builder, 1);
100
+ /// ExtractionConfig* config = kreuzberg_config_builder_build(builder);
101
+ /// // builder is now consumed, don't call kreuzberg_config_builder_free
102
+ /// kreuzberg_config_free(config);
103
+ /// ```
104
+ #[unsafe(no_mangle)]
105
+ pub unsafe extern "C" fn kreuzberg_config_builder_new() -> *mut ConfigBuilder {
106
+ ffi_panic_guard!("kreuzberg_config_builder_new", {
107
+ clear_last_error();
108
+ Box::into_raw(Box::new(ConfigBuilder::new()))
109
+ })
110
+ }
111
+
112
+ /// Set the use_cache field.
113
+ ///
114
+ /// # Arguments
115
+ ///
116
+ /// * `builder` - Non-null pointer to ConfigBuilder
117
+ /// * `use_cache` - 1 for true, 0 for false
118
+ ///
119
+ /// # Returns
120
+ ///
121
+ /// 0 on success, -1 on error (NULL builder)
122
+ ///
123
+ /// # Safety
124
+ ///
125
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
126
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
127
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
128
+ #[unsafe(no_mangle)]
129
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_use_cache(builder: *mut ConfigBuilder, use_cache: i32) -> i32 {
130
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_use_cache", {
131
+ if builder.is_null() {
132
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
133
+ return -1;
134
+ }
135
+
136
+ clear_last_error();
137
+ unsafe { (*builder).set_use_cache(use_cache != 0) };
138
+ 0
139
+ })
140
+ }
141
+
142
+ /// Set OCR configuration from JSON.
143
+ ///
144
+ /// # Arguments
145
+ ///
146
+ /// * `builder` - Non-null pointer to ConfigBuilder
147
+ /// * `ocr_json` - JSON string like `{"backend": "tesseract", "languages": ["en"]}`
148
+ ///
149
+ /// # Returns
150
+ ///
151
+ /// 0 on success, -1 on error (check kreuzberg_last_error)
152
+ ///
153
+ /// # Safety
154
+ ///
155
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
156
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
157
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
158
+ /// - `ocr_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
159
+ /// - The string pointer must remain valid for the duration of the function call
160
+ #[unsafe(no_mangle)]
161
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_ocr(builder: *mut ConfigBuilder, ocr_json: *const c_char) -> i32 {
162
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_ocr", {
163
+ if builder.is_null() {
164
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
165
+ return -1;
166
+ }
167
+ if ocr_json.is_null() {
168
+ set_last_error("OCR JSON cannot be NULL".to_string());
169
+ return -1;
170
+ }
171
+
172
+ clear_last_error();
173
+
174
+ let json_str = match unsafe { CStr::from_ptr(ocr_json) }.to_str() {
175
+ Ok(s) => s,
176
+ Err(e) => {
177
+ set_last_error(format!("Invalid UTF-8 in OCR JSON: {}", e));
178
+ return -1;
179
+ }
180
+ };
181
+
182
+ match unsafe { (*builder).set_ocr_from_json(json_str) } {
183
+ Ok(()) => 0,
184
+ Err(e) => {
185
+ set_last_error(e);
186
+ -1
187
+ }
188
+ }
189
+ })
190
+ }
191
+
192
+ /// Set PDF configuration from JSON.
193
+ ///
194
+ /// # Arguments
195
+ ///
196
+ /// * `builder` - Non-null pointer to ConfigBuilder
197
+ /// * `pdf_json` - JSON string for PDF config
198
+ ///
199
+ /// # Returns
200
+ ///
201
+ /// 0 on success, -1 on error
202
+ ///
203
+ /// # Safety
204
+ ///
205
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
206
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
207
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
208
+ /// - `pdf_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
209
+ /// - The string pointer must remain valid for the duration of the function call
210
+ #[unsafe(no_mangle)]
211
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_pdf(builder: *mut ConfigBuilder, pdf_json: *const c_char) -> i32 {
212
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_pdf", {
213
+ if builder.is_null() {
214
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
215
+ return -1;
216
+ }
217
+ if pdf_json.is_null() {
218
+ set_last_error("PDF JSON cannot be NULL".to_string());
219
+ return -1;
220
+ }
221
+
222
+ clear_last_error();
223
+
224
+ let json_str = match unsafe { CStr::from_ptr(pdf_json) }.to_str() {
225
+ Ok(s) => s,
226
+ Err(e) => {
227
+ set_last_error(format!("Invalid UTF-8 in PDF JSON: {}", e));
228
+ return -1;
229
+ }
230
+ };
231
+
232
+ match unsafe { (*builder).set_pdf_from_json(json_str) } {
233
+ Ok(()) => 0,
234
+ Err(e) => {
235
+ set_last_error(e);
236
+ -1
237
+ }
238
+ }
239
+ })
240
+ }
241
+
242
+ /// Set chunking configuration from JSON.
243
+ ///
244
+ /// # Arguments
245
+ ///
246
+ /// * `builder` - Non-null pointer to ConfigBuilder
247
+ /// * `chunking_json` - JSON string for chunking config
248
+ ///
249
+ /// # Returns
250
+ ///
251
+ /// 0 on success, -1 on error
252
+ ///
253
+ /// # Safety
254
+ ///
255
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
256
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
257
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
258
+ /// - `chunking_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
259
+ /// - The string pointer must remain valid for the duration of the function call
260
+ #[unsafe(no_mangle)]
261
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_chunking(
262
+ builder: *mut ConfigBuilder,
263
+ chunking_json: *const c_char,
264
+ ) -> i32 {
265
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_chunking", {
266
+ if builder.is_null() {
267
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
268
+ return -1;
269
+ }
270
+ if chunking_json.is_null() {
271
+ set_last_error("Chunking JSON cannot be NULL".to_string());
272
+ return -1;
273
+ }
274
+
275
+ clear_last_error();
276
+
277
+ let json_str = match unsafe { CStr::from_ptr(chunking_json) }.to_str() {
278
+ Ok(s) => s,
279
+ Err(e) => {
280
+ set_last_error(format!("Invalid UTF-8 in chunking JSON: {}", e));
281
+ return -1;
282
+ }
283
+ };
284
+
285
+ match unsafe { (*builder).set_chunking_from_json(json_str) } {
286
+ Ok(()) => 0,
287
+ Err(e) => {
288
+ set_last_error(e);
289
+ -1
290
+ }
291
+ }
292
+ })
293
+ }
294
+
295
+ /// Set image extraction configuration from JSON.
296
+ ///
297
+ /// # Arguments
298
+ ///
299
+ /// * `builder` - Non-null pointer to ConfigBuilder
300
+ /// * `image_json` - JSON string for image extraction config
301
+ ///
302
+ /// # Returns
303
+ ///
304
+ /// 0 on success, -1 on error
305
+ ///
306
+ /// # Safety
307
+ ///
308
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
309
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
310
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
311
+ /// - `image_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
312
+ /// - The string pointer must remain valid for the duration of the function call
313
+ #[unsafe(no_mangle)]
314
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_image_extraction(
315
+ builder: *mut ConfigBuilder,
316
+ image_json: *const c_char,
317
+ ) -> i32 {
318
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_image_extraction", {
319
+ if builder.is_null() {
320
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
321
+ return -1;
322
+ }
323
+ if image_json.is_null() {
324
+ set_last_error("Image extraction JSON cannot be NULL".to_string());
325
+ return -1;
326
+ }
327
+
328
+ clear_last_error();
329
+
330
+ let json_str = match unsafe { CStr::from_ptr(image_json) }.to_str() {
331
+ Ok(s) => s,
332
+ Err(e) => {
333
+ set_last_error(format!("Invalid UTF-8 in image extraction JSON: {}", e));
334
+ return -1;
335
+ }
336
+ };
337
+
338
+ match unsafe { (*builder).set_image_extraction_from_json(json_str) } {
339
+ Ok(()) => 0,
340
+ Err(e) => {
341
+ set_last_error(e);
342
+ -1
343
+ }
344
+ }
345
+ })
346
+ }
347
+
348
+ /// Set post-processor configuration from JSON.
349
+ ///
350
+ /// # Arguments
351
+ ///
352
+ /// * `builder` - Non-null pointer to ConfigBuilder
353
+ /// * `pp_json` - JSON string for post-processor config
354
+ ///
355
+ /// # Returns
356
+ ///
357
+ /// 0 on success, -1 on error
358
+ ///
359
+ /// # Safety
360
+ ///
361
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
362
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
363
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
364
+ /// - `pp_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
365
+ /// - The string pointer must remain valid for the duration of the function call
366
+ #[unsafe(no_mangle)]
367
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_post_processor(
368
+ builder: *mut ConfigBuilder,
369
+ pp_json: *const c_char,
370
+ ) -> i32 {
371
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_post_processor", {
372
+ if builder.is_null() {
373
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
374
+ return -1;
375
+ }
376
+ if pp_json.is_null() {
377
+ set_last_error("Post-processor JSON cannot be NULL".to_string());
378
+ return -1;
379
+ }
380
+
381
+ clear_last_error();
382
+
383
+ let json_str = match unsafe { CStr::from_ptr(pp_json) }.to_str() {
384
+ Ok(s) => s,
385
+ Err(e) => {
386
+ set_last_error(format!("Invalid UTF-8 in post-processor JSON: {}", e));
387
+ return -1;
388
+ }
389
+ };
390
+
391
+ match unsafe { (*builder).set_post_processor_from_json(json_str) } {
392
+ Ok(()) => 0,
393
+ Err(e) => {
394
+ set_last_error(e);
395
+ -1
396
+ }
397
+ }
398
+ })
399
+ }
400
+
401
+ /// Set language detection configuration from JSON.
402
+ ///
403
+ /// # Arguments
404
+ ///
405
+ /// * `builder` - Non-null pointer to ConfigBuilder
406
+ /// * `ld_json` - JSON string for language detection config
407
+ ///
408
+ /// # Returns
409
+ ///
410
+ /// 0 on success, -1 on error
411
+ ///
412
+ /// # Safety
413
+ ///
414
+ /// This function is meant to be called from C/FFI code. The caller must ensure:
415
+ /// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
416
+ /// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
417
+ /// - `ld_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
418
+ /// - The string pointer must remain valid for the duration of the function call
419
+ #[unsafe(no_mangle)]
420
+ pub unsafe extern "C" fn kreuzberg_config_builder_set_language_detection(
421
+ builder: *mut ConfigBuilder,
422
+ ld_json: *const c_char,
423
+ ) -> i32 {
424
+ ffi_panic_guard_i32!("kreuzberg_config_builder_set_language_detection", {
425
+ if builder.is_null() {
426
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
427
+ return -1;
428
+ }
429
+ if ld_json.is_null() {
430
+ set_last_error("Language detection JSON cannot be NULL".to_string());
431
+ return -1;
432
+ }
433
+
434
+ clear_last_error();
435
+
436
+ let json_str = match unsafe { CStr::from_ptr(ld_json) }.to_str() {
437
+ Ok(s) => s,
438
+ Err(e) => {
439
+ set_last_error(format!("Invalid UTF-8 in language detection JSON: {}", e));
440
+ return -1;
441
+ }
442
+ };
443
+
444
+ match unsafe { (*builder).set_language_detection_from_json(json_str) } {
445
+ Ok(()) => 0,
446
+ Err(e) => {
447
+ set_last_error(e);
448
+ -1
449
+ }
450
+ }
451
+ })
452
+ }
453
+
454
+ /// Build the final ExtractionConfig and consume the builder.
455
+ ///
456
+ /// After calling this function, the builder pointer is invalid and must not be used.
457
+ /// The returned ExtractionConfig must be freed with kreuzberg_config_free().
458
+ ///
459
+ /// # Arguments
460
+ ///
461
+ /// * `builder` - Non-null pointer to ConfigBuilder (will be consumed)
462
+ ///
463
+ /// # Returns
464
+ ///
465
+ /// Pointer to ExtractionConfig on success, NULL on error
466
+ ///
467
+ /// # Safety
468
+ ///
469
+ /// - `builder` is consumed and must not be used after this call
470
+ /// - Do NOT call kreuzberg_config_builder_free() after this function
471
+ /// - The returned ExtractionConfig must be freed with kreuzberg_config_free()
472
+ #[unsafe(no_mangle)]
473
+ pub unsafe extern "C" fn kreuzberg_config_builder_build(builder: *mut ConfigBuilder) -> *mut ExtractionConfig {
474
+ ffi_panic_guard!("kreuzberg_config_builder_build", {
475
+ if builder.is_null() {
476
+ set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
477
+ return ptr::null_mut();
478
+ }
479
+
480
+ clear_last_error();
481
+ let builder_box = unsafe { Box::from_raw(builder) };
482
+ let config = builder_box.build();
483
+ Box::into_raw(Box::new(config))
484
+ })
485
+ }
486
+
487
+ /// Free a ConfigBuilder without building.
488
+ ///
489
+ /// Use this to discard a builder without creating a config.
490
+ /// Do NOT call this after kreuzberg_config_builder_build() (builder is already consumed).
491
+ ///
492
+ /// # Arguments
493
+ ///
494
+ /// * `builder` - Pointer to ConfigBuilder, can be NULL (no-op)
495
+ ///
496
+ /// # Safety
497
+ ///
498
+ /// - `builder` can be NULL (no-op)
499
+ /// - Do NOT call this after kreuzberg_config_builder_build()
500
+ #[unsafe(no_mangle)]
501
+ pub unsafe extern "C" fn kreuzberg_config_builder_free(builder: *mut ConfigBuilder) {
502
+ if !builder.is_null() {
503
+ unsafe { drop(Box::from_raw(builder)) };
504
+ }
505
+ }
506
+
507
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// A builder with one flag set builds into a config carrying that flag.
    #[test]
    fn test_builder_basic_flow() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            assert_eq!(kreuzberg_config_builder_set_use_cache(handle, 1), 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            assert!((*built).use_cache);

            // Reclaim the config so the allocation is released.
            drop(Box::from_raw(built));
        }
    }

    /// Valid OCR JSON is accepted and shows up in the built config.
    #[test]
    fn test_builder_with_ocr() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            let payload = CString::new(r#"{"backend":"tesseract","languages":["en"]}"#).unwrap();
            assert_eq!(kreuzberg_config_builder_set_ocr(handle, payload.as_ptr()), 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            assert!((*built).ocr.is_some());

            // Reclaim the config so the allocation is released.
            drop(Box::from_raw(built));
        }
    }

    /// NULL builder pointers are rejected instead of being dereferenced.
    #[test]
    fn test_builder_null_checks() {
        unsafe {
            // A setter called with a NULL builder reports failure.
            let status = kreuzberg_config_builder_set_use_cache(ptr::null_mut(), 1);
            assert_eq!(status, -1);

            // Building from a NULL builder yields a NULL config.
            let built = kreuzberg_config_builder_build(ptr::null_mut());
            assert!(built.is_null());
        }
    }

    /// Freeing works both for a live builder and for NULL.
    #[test]
    fn test_builder_free() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            // Discarding an unbuilt builder should not crash.
            kreuzberg_config_builder_free(handle);

            // Passing NULL should not crash either.
            kreuzberg_config_builder_free(ptr::null_mut());
        }
    }

    /// A rejected JSON payload leaves the builder usable.
    #[test]
    fn test_builder_invalid_json() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            let payload = CString::new("not valid json").unwrap();
            assert_eq!(kreuzberg_config_builder_set_ocr(handle, payload.as_ptr()), -1);

            // The failed call must not prevent further configuration.
            assert_eq!(kreuzberg_config_builder_set_use_cache(handle, 0), 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            // Reclaim the config so the allocation is released.
            drop(Box::from_raw(built));
        }
    }
}
@@ -486,22 +486,54 @@ pub extern "C" fn kreuzberg_get_error_details() -> CErrorDetails {
486
486
  (None, None, 0)
487
487
  };
488
488
 
489
+ // Helper to convert string to C string with proper error handling.
490
+ // On failure, logs the error and returns a fallback heap-allocated string.
491
+ fn string_to_cstring_with_fallback(value: String, fallback: &str, field_name: &str) -> *mut c_char {
492
+ match CString::new(value) {
493
+ Ok(cstr) => cstr.into_raw(),
494
+ Err(e) => {
495
+ log::warn!(
496
+ "kreuzberg_get_error_details: CString creation failed for {}: {} (contains interior NUL byte)",
497
+ field_name,
498
+ e
499
+ );
500
+ // Allocate a proper CString for the fallback so it can be safely freed
501
+ CString::new(fallback).map(CString::into_raw).unwrap_or_else(|_| {
502
+ // This should never happen since fallback is a static string without NUL bytes
503
+ log::warn!(
504
+ "kreuzberg_get_error_details: CRITICAL - fallback CString creation also failed for {}",
505
+ field_name
506
+ );
507
+ ptr::null_mut()
508
+ })
509
+ }
510
+ }
511
+ }
512
+
513
+ // Helper for optional string fields (accepts &str to match panic context types)
514
+ fn optional_str_to_cstring(value: Option<&str>, field_name: &str) -> *mut c_char {
515
+ match value {
516
+ Some(s) => match CString::new(s) {
517
+ Ok(cstr) => cstr.into_raw(),
518
+ Err(e) => {
519
+ log::warn!(
520
+ "kreuzberg_get_error_details: CString creation failed for {}: {} (contains interior NUL byte)",
521
+ field_name,
522
+ e
523
+ );
524
+ ptr::null_mut()
525
+ }
526
+ },
527
+ None => ptr::null_mut(),
528
+ }
529
+ }
530
+
489
531
  CErrorDetails {
490
- message: CString::new(message)
491
- .map(CString::into_raw)
492
- .unwrap_or_else(|_| "Error message creation failed".as_ptr() as *mut c_char),
532
+ message: string_to_cstring_with_fallback(message, "CString error", "message"),
493
533
  error_code,
494
- error_type: CString::new(error_type)
495
- .map(CString::into_raw)
496
- .unwrap_or_else(|_| "unknown".as_ptr() as *mut c_char),
497
- source_file: source_file
498
- .and_then(|f| CString::new(f).ok())
499
- .map(CString::into_raw)
500
- .unwrap_or(ptr::null_mut()),
501
- source_function: source_function
502
- .and_then(|f| CString::new(f).ok())
503
- .map(CString::into_raw)
504
- .unwrap_or(ptr::null_mut()),
534
+ error_type: string_to_cstring_with_fallback(error_type, "unknown", "error_type"),
535
+ source_file: optional_str_to_cstring(source_file, "source_file"),
536
+ source_function: optional_str_to_cstring(source_function, "source_function"),
505
537
  source_line,
506
538
  context_info: ptr::null_mut(),
507
539
  is_panic,