kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -25,6 +25,14 @@ typedef struct ExtractionConfig ExtractionConfig;
25
25
  typedef struct ExtractionResult ExtractionResult;
26
26
 
27
27
 
28
+ /**
29
+ * Opaque builder struct for constructing ExtractionConfig.
30
+ *
31
+ * Use kreuzberg_config_builder_new() to create, set fields with setters,
32
+ * then finalize with kreuzberg_config_builder_build().
33
+ */
34
+ typedef struct ConfigBuilder ConfigBuilder;
35
+
28
36
  typedef struct Option_ErrorCallback Option_ErrorCallback;
29
37
 
30
38
  /**
@@ -658,22 +666,6 @@ int kreuzberg_extract_batch_parallel(const char *const *files,
658
666
  * - `json_config` must be a valid null-terminated C string
659
667
  * - The returned pointer must be freed with `kreuzberg_config_free`
660
668
  * - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
661
- *
662
- * # Example (C)
663
- *
664
- * ```c
665
- * const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
666
- * ExtractionConfig* config = kreuzberg_config_from_json(config_json);
667
- * if (config == NULL) {
668
- * printf("Error: %s\n", kreuzberg_last_error());
669
- * return 1;
670
- * }
671
- *
672
- * // Use config...
673
- * // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
674
- *
675
- * kreuzberg_config_free(config);
676
- * ```
677
669
  */
678
670
  ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
679
671
 
@@ -685,30 +677,12 @@ ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
685
677
  * - `config` must be a pointer previously returned by a config creation function
686
678
  * - `config` can be NULL (no-op)
687
679
  * - `config` must not be used after this call
688
- *
689
- * # Example (C)
690
- *
691
- * ```c
692
- * ExtractionConfig* config = kreuzberg_config_from_json("{...}");
693
- * if (config != NULL) {
694
- * // Use config...
695
- * kreuzberg_config_free(config);
696
- * }
697
- * ```
698
680
  */
699
681
  void kreuzberg_config_free(ExtractionConfig *config);
700
682
 
701
683
  /**
702
684
  * Validate a JSON config string without parsing it.
703
685
  *
704
- * This function checks if a JSON config string is valid and would parse correctly,
705
- * without allocating the full ExtractionConfig structure. Useful for validation
706
- * before committing to parsing.
707
- *
708
- * # Arguments
709
- *
710
- * * `json_config` - Null-terminated C string containing JSON configuration
711
- *
712
686
  * # Returns
713
687
  *
714
688
  * - 1 if valid (would parse successfully)
@@ -717,238 +691,312 @@ void kreuzberg_config_free(ExtractionConfig *config);
717
691
  * # Safety
718
692
  *
719
693
  * - `json_config` must be a valid null-terminated C string
720
- *
721
- * # Example (C)
722
- *
723
- * ```c
724
- * const char* config_json = "{\"use_cache\": true}";
725
- * if (kreuzberg_config_is_valid(config_json)) {
726
- * ExtractionConfig* config = kreuzberg_config_from_json(config_json);
727
- * // Use config...
728
- * kreuzberg_config_free(config);
729
- * } else {
730
- * printf("Invalid config: %s\n", kreuzberg_last_error());
731
- * }
732
- * ```
733
694
  */
734
695
  int32_t kreuzberg_config_is_valid(const char *json_config);
735
696
 
736
697
  /**
737
698
  * Serialize an ExtractionConfig to JSON string.
738
699
  *
739
- * Converts an ExtractionConfig structure to its JSON representation, allowing
740
- * bindings to serialize configs without reimplementing serialization logic.
700
+ * # Safety
741
701
  *
742
- * # Arguments
702
+ * - `config` must be a valid pointer to an ExtractionConfig
703
+ * - The returned pointer must be freed with `kreuzberg_free_string`
704
+ */
705
+ char *kreuzberg_config_to_json(const ExtractionConfig *config);
706
+
707
+ /**
708
+ * Get a specific field from config as JSON string.
743
709
  *
744
- * * `config` - Pointer to an ExtractionConfig structure
710
+ * # Safety
711
+ *
712
+ * - `config` must be a valid pointer to an ExtractionConfig
713
+ * - `field_name` must be a valid null-terminated C string
714
+ */
715
+ char *kreuzberg_config_get_field(const ExtractionConfig *config, const char *field_name);
716
+
717
+ /**
718
+ * Merge two configs (override takes precedence over base).
745
719
  *
746
720
  * # Returns
747
721
  *
748
- * A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
749
- * Returns NULL on error (check `kreuzberg_last_error`).
722
+ * - 1 on success
723
+ * - 0 on error (check `kreuzberg_last_error`)
750
724
  *
751
725
  * # Safety
752
726
  *
753
- * - `config` must be a valid pointer to an ExtractionConfig
754
- * - `config` cannot be NULL
755
- * - The returned pointer must be freed with `kreuzberg_free_string`
727
+ * - `base` must be a valid mutable pointer to an ExtractionConfig
728
+ * - `override_config` must be a valid pointer to an ExtractionConfig
729
+ */
730
+ int32_t kreuzberg_config_merge(ExtractionConfig *base, const ExtractionConfig *override_config);
731
+
732
+ /**
733
+ * Load an ExtractionConfig from a file (returns JSON string).
756
734
  *
757
- * # Example (C)
735
+ * # Safety
758
736
  *
759
- * ```c
760
- * ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
761
- * if (config != NULL) {
762
- * char* json = kreuzberg_config_to_json(config);
763
- * if (json != NULL) {
764
- * printf("Serialized: %s\n", json);
765
- * kreuzberg_free_string(json);
766
- * }
767
- * kreuzberg_config_free(config);
768
- * }
769
- * ```
737
+ * - `file_path` must be a valid null-terminated C string
738
+ * - The returned string must be freed with `kreuzberg_free_string`
770
739
  */
771
- char *kreuzberg_config_to_json(const ExtractionConfig *config);
740
+ char *kreuzberg_load_extraction_config_from_file(const char *file_path);
772
741
 
773
742
  /**
774
- * Get a specific field from config as JSON string.
743
+ * Load an ExtractionConfig from a file (returns pointer to config struct).
775
744
  *
776
- * Retrieves a nested field from the configuration by path and returns its JSON
777
- * representation. Supports dot notation for nested fields (e.g., "ocr.backend").
745
+ * # Safety
778
746
  *
779
- * # Arguments
747
+ * - `path` must be a valid null-terminated C string
748
+ * - The returned pointer must be freed with `kreuzberg_config_free`
749
+ */
750
+ ExtractionConfig *kreuzberg_config_from_file(const char *path);
751
+
752
+ /**
753
+ * Discover and load an ExtractionConfig by searching parent directories.
780
754
  *
781
- * * `config` - Pointer to an ExtractionConfig structure
782
- * * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
755
+ * # Safety
783
756
  *
784
- * # Returns
757
+ * - The returned string must be freed with `kreuzberg_free_string`
758
+ */
759
+ char *kreuzberg_config_discover(void);
760
+
761
+ /**
762
+ * List available embedding preset names.
785
763
  *
786
- * A pointer to a C string containing the field value as JSON, or NULL if:
787
- * - The field doesn't exist
788
- * - An error occurs during serialization
764
+ * # Safety
789
765
  *
790
- * The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
766
+ * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
767
+ */
768
+ char *kreuzberg_list_embedding_presets(void);
769
+
770
+ /**
771
+ * Get a specific embedding preset by name.
791
772
  *
792
773
  * # Safety
793
774
  *
794
- * - `config` must be a valid pointer to an ExtractionConfig
795
- * - `field_name` must be a valid null-terminated C string
796
- * - Neither parameter can be NULL
775
+ * - `name` must be a valid null-terminated C string
776
+ * - Returned string is JSON object and must be freed with `kreuzberg_free_string`
777
+ */
778
+ char *kreuzberg_get_embedding_preset(const char *name);
779
+
780
+ /**
781
+ * Create a new config builder.
797
782
  *
798
- * # Example (C)
783
+ * Returns an opaque pointer to ConfigBuilder. Must be freed with
784
+ * kreuzberg_config_builder_free() or consumed by kreuzberg_config_builder_build().
799
785
  *
800
- * ```c
801
- * ExtractionConfig* config = kreuzberg_config_from_json(
802
- * "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
803
- * );
804
- * if (config != NULL) {
805
- * char* use_cache = kreuzberg_config_get_field(config, "use_cache");
806
- * char* backend = kreuzberg_config_get_field(config, "ocr.backend");
786
+ * # Safety
807
787
  *
808
- * if (use_cache != NULL) {
809
- * printf("use_cache: %s\n", use_cache);
810
- * kreuzberg_free_string(use_cache);
811
- * }
788
+ * The returned pointer must be freed with kreuzberg_config_builder_free()
789
+ * or passed to kreuzberg_config_builder_build().
812
790
  *
813
- * if (backend != NULL) {
814
- * printf("backend: %s\n", backend);
815
- * kreuzberg_free_string(backend);
816
- * }
791
+ * # Example (C)
817
792
  *
818
- * kreuzberg_config_free(config);
819
- * }
793
+ * ```c
794
+ * ConfigBuilder* builder = kreuzberg_config_builder_new();
795
+ * kreuzberg_config_builder_set_use_cache(builder, 1);
796
+ * ExtractionConfig* config = kreuzberg_config_builder_build(builder);
797
+ * // builder is now consumed, don't call kreuzberg_config_builder_free
798
+ * kreuzberg_config_free(config);
820
799
  * ```
821
800
  */
822
- char *kreuzberg_config_get_field(const ExtractionConfig *config, const char *field_name);
801
+ struct ConfigBuilder *kreuzberg_config_builder_new(void);
823
802
 
824
803
  /**
825
- * Merge two configs (override takes precedence over base).
826
- *
827
- * Performs a shallow merge of two ExtractionConfig structures, where fields
828
- * from `override_config` take precedence over fields in `base`. The `base`
829
- * config is modified in-place.
804
+ * Set the use_cache field.
830
805
  *
831
806
  * # Arguments
832
807
  *
833
- * * `base` - Pointer to the base ExtractionConfig (will be modified)
834
- * * `override_config` - Pointer to the override ExtractionConfig (read-only)
808
+ * * `builder` - Non-null pointer to ConfigBuilder
809
+ * * `use_cache` - 1 for true, 0 for false
835
810
  *
836
811
  * # Returns
837
812
  *
838
- * - 1 on success
839
- * - 0 on error (check `kreuzberg_last_error`)
813
+ * 0 on success, -1 on error (NULL builder)
840
814
  *
841
815
  * # Safety
842
816
  *
843
- * - `base` must be a valid mutable pointer to an ExtractionConfig
844
- * - `override_config` must be a valid pointer to an ExtractionConfig
845
- * - Neither parameter can be NULL
846
- * - `base` is modified in-place
817
+ * This function is meant to be called from C/FFI code. The caller must ensure:
818
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
819
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
820
+ */
821
+ int32_t kreuzberg_config_builder_set_use_cache(struct ConfigBuilder *builder,
822
+ int32_t use_cache);
823
+
824
+ /**
825
+ * Set OCR configuration from JSON.
847
826
  *
848
- * # Example (C)
827
+ * # Arguments
849
828
  *
850
- * ```c
851
- * ExtractionConfig* base = kreuzberg_config_from_json(
852
- * "{\"use_cache\": true, \"force_ocr\": false}"
853
- * );
854
- * ExtractionConfig* override = kreuzberg_config_from_json(
855
- * "{\"force_ocr\": true}"
856
- * );
829
+ * * `builder` - Non-null pointer to ConfigBuilder
830
+ * * `ocr_json` - JSON string like `{"backend": "tesseract", "languages": ["en"]}`
857
831
  *
858
- * if (kreuzberg_config_merge(base, override) == 1) {
859
- * // base now has: use_cache=true, force_ocr=true
860
- * char* json = kreuzberg_config_to_json(base);
861
- * printf("Merged config: %s\n", json);
862
- * kreuzberg_free_string(json);
863
- * }
832
+ * # Returns
864
833
  *
865
- * kreuzberg_config_free(base);
866
- * kreuzberg_config_free(override);
867
- * ```
834
+ * 0 on success, -1 on error (check kreuzberg_last_error)
835
+ *
836
+ * # Safety
837
+ *
838
+ * This function is meant to be called from C/FFI code. The caller must ensure:
839
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
840
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
841
+ * - `ocr_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
842
+ * - The string pointer must remain valid for the duration of the function call
868
843
  */
869
- int32_t kreuzberg_config_merge(ExtractionConfig *base, const ExtractionConfig *override_config);
844
+ int32_t kreuzberg_config_builder_set_ocr(struct ConfigBuilder *builder,
845
+ const char *ocr_json);
870
846
 
871
847
  /**
872
- * Load an ExtractionConfig from a file.
848
+ * Set PDF configuration from JSON.
849
+ *
850
+ * # Arguments
851
+ *
852
+ * * `builder` - Non-null pointer to ConfigBuilder
853
+ * * `pdf_json` - JSON string for PDF config
854
+ *
855
+ * # Returns
873
856
  *
874
- * Returns a JSON string representing the loaded configuration.
857
+ * 0 on success, -1 on error
875
858
  *
876
859
  * # Safety
877
860
  *
878
- * - `file_path` must be a valid null-terminated C string
879
- * - The returned string must be freed with `kreuzberg_free_string`
880
- * - Returns NULL on error (check `kreuzberg_last_error`)
861
+ * This function is meant to be called from C/FFI code. The caller must ensure:
862
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
863
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
864
+ * - `pdf_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
865
+ * - The string pointer must remain valid for the duration of the function call
881
866
  */
882
- char *kreuzberg_load_extraction_config_from_file(const char *file_path);
867
+ int32_t kreuzberg_config_builder_set_pdf(struct ConfigBuilder *builder,
868
+ const char *pdf_json);
883
869
 
884
870
  /**
885
- * Load an ExtractionConfig from a file (returns pointer to config struct).
871
+ * Set chunking configuration from JSON.
872
+ *
873
+ * # Arguments
874
+ *
875
+ * * `builder` - Non-null pointer to ConfigBuilder
876
+ * * `chunking_json` - JSON string for chunking config
877
+ *
878
+ * # Returns
879
+ *
880
+ * 0 on success, -1 on error
886
881
  *
887
882
  * # Safety
888
883
  *
889
- * - `path` must be a valid null-terminated C string
890
- * - The returned pointer must be freed with `kreuzberg_config_free`
891
- * - Returns NULL on error (check `kreuzberg_last_error`)
884
+ * This function is meant to be called from C/FFI code. The caller must ensure:
885
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
886
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
887
+ * - `chunking_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
888
+ * - The string pointer must remain valid for the duration of the function call
889
+ */
890
+ int32_t kreuzberg_config_builder_set_chunking(struct ConfigBuilder *builder,
891
+ const char *chunking_json);
892
+
893
+ /**
894
+ * Set image extraction configuration from JSON.
892
895
  *
893
- * # Example (C)
896
+ * # Arguments
894
897
  *
895
- * ```c
896
- * ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
897
- * if (config == NULL) {
898
- * printf("Error: %s\n", kreuzberg_last_error());
899
- * return 1;
900
- * }
901
- * kreuzberg_config_free(config);
902
- * ```
898
+ * * `builder` - Non-null pointer to ConfigBuilder
899
+ * * `image_json` - JSON string for image extraction config
900
+ *
901
+ * # Returns
902
+ *
903
+ * 0 on success, -1 on error
904
+ *
905
+ * # Safety
906
+ *
907
+ * This function is meant to be called from C/FFI code. The caller must ensure:
908
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
909
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
910
+ * - `image_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
911
+ * - The string pointer must remain valid for the duration of the function call
903
912
  */
904
- ExtractionConfig *kreuzberg_config_from_file(const char *path);
913
+ int32_t kreuzberg_config_builder_set_image_extraction(struct ConfigBuilder *builder,
914
+ const char *image_json);
905
915
 
906
916
  /**
907
- * Discover and load an ExtractionConfig by searching parent directories.
917
+ * Set post-processor configuration from JSON.
908
918
  *
909
- * Searches the current directory and all parent directories for:
910
- * - `kreuzberg.toml`
911
- * - `kreuzberg.json`
919
+ * # Arguments
912
920
  *
913
- * Returns the first config file found as a JSON string.
921
+ * * `builder` - Non-null pointer to ConfigBuilder
922
+ * * `pp_json` - JSON string for post-processor config
923
+ *
924
+ * # Returns
925
+ *
926
+ * 0 on success, -1 on error
914
927
  *
915
928
  * # Safety
916
929
  *
917
- * - The returned string must be freed with `kreuzberg_free_string`
918
- * - Returns NULL if no config is found or on error
930
+ * This function is meant to be called from C/FFI code. The caller must ensure:
931
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
932
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
933
+ * - `pp_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
934
+ * - The string pointer must remain valid for the duration of the function call
935
+ */
936
+ int32_t kreuzberg_config_builder_set_post_processor(struct ConfigBuilder *builder,
937
+ const char *pp_json);
938
+
939
+ /**
940
+ * Set language detection configuration from JSON.
919
941
  *
920
- * # Example (C)
942
+ * # Arguments
921
943
  *
922
- * ```c
923
- * char* config_json = kreuzberg_config_discover();
924
- * if (config_json != NULL) {
925
- * printf("Discovered config: %s\n", config_json);
926
- * kreuzberg_free_string(config_json);
927
- * }
928
- * ```
944
+ * * `builder` - Non-null pointer to ConfigBuilder
945
+ * * `ld_json` - JSON string for language detection config
946
+ *
947
+ * # Returns
948
+ *
949
+ * 0 on success, -1 on error
950
+ *
951
+ * # Safety
952
+ *
953
+ * This function is meant to be called from C/FFI code. The caller must ensure:
954
+ * - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
955
+ * - The pointer must be properly aligned and point to a valid ConfigBuilder instance
956
+ * - `ld_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
957
+ * - The string pointer must remain valid for the duration of the function call
929
958
  */
930
- char *kreuzberg_config_discover(void);
959
+ int32_t kreuzberg_config_builder_set_language_detection(struct ConfigBuilder *builder,
960
+ const char *ld_json);
931
961
 
932
962
  /**
933
- * List available embedding preset names.
963
+ * Build the final ExtractionConfig and consume the builder.
964
+ *
965
+ * After calling this function, the builder pointer is invalid and must not be used.
966
+ * The returned ExtractionConfig must be freed with kreuzberg_config_free().
967
+ *
968
+ * # Arguments
969
+ *
970
+ * * `builder` - Non-null pointer to ConfigBuilder (will be consumed)
971
+ *
972
+ * # Returns
973
+ *
974
+ * Pointer to ExtractionConfig on success, NULL on error
934
975
  *
935
976
  * # Safety
936
977
  *
937
- * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
938
- * - Returns NULL on error (check `kreuzberg_last_error`)
978
+ * - `builder` is consumed and must not be used after this call
979
+ * - Do NOT call kreuzberg_config_builder_free() after this function
980
+ * - The returned ExtractionConfig must be freed with kreuzberg_config_free()
939
981
  */
940
- char *kreuzberg_list_embedding_presets(void);
982
+ ExtractionConfig *kreuzberg_config_builder_build(struct ConfigBuilder *builder);
941
983
 
942
984
  /**
943
- * Get a specific embedding preset by name.
985
+ * Free a ConfigBuilder without building.
986
+ *
987
+ * Use this to discard a builder without creating a config.
988
+ * Do NOT call this after kreuzberg_config_builder_build() (builder is already consumed).
989
+ *
990
+ * # Arguments
991
+ *
992
+ * * `builder` - Pointer to ConfigBuilder, can be NULL (no-op)
944
993
  *
945
994
  * # Safety
946
995
  *
947
- * - `name` must be a valid null-terminated C string
948
- * - Returned string is JSON object and must be freed with `kreuzberg_free_string`
949
- * - Returns NULL on error (check `kreuzberg_last_error`)
996
+ * - `builder` can be NULL (no-op)
997
+ * - Do NOT call this after kreuzberg_config_builder_build()
950
998
  */
951
- char *kreuzberg_get_embedding_preset(const char *name);
999
+ void kreuzberg_config_builder_free(struct ConfigBuilder *builder);
952
1000
 
953
1001
  /**
954
1002
  * Returns the validation error code (0).
@@ -1370,6 +1418,127 @@ struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithM
1370
1418
  uintptr_t count,
1371
1419
  const char *config_json);
1372
1420
 
1421
+ /**
1422
+ * Parse HeadingStyle from string to discriminant.
1423
+ *
1424
+ * Valid values: "atx", "underlined", "atx_closed" (alias: "atx-closed")
1425
+ * Returns: 0 = Atx, 1 = Underlined, 2 = AtxClosed, -1 = Invalid
1426
+ *
1427
+ * # Safety
1428
+ *
1429
+ * - `value` must be a valid null-terminated C string or NULL
1430
+ */
1431
+ int32_t kreuzberg_parse_heading_style(const char *value);
1432
+
1433
+ /**
1434
+ * Convert HeadingStyle discriminant to string.
1435
+ *
1436
+ * Returns: pointer to static string, or NULL for invalid discriminant
1437
+ */
1438
+ const char *kreuzberg_heading_style_to_string(int32_t discriminant);
1439
+
1440
+ /**
1441
+ * Parse CodeBlockStyle from string to discriminant.
1442
+ *
1443
+ * Valid values: "indented", "backticks", "tildes"
1444
+ * Returns: 0 = Indented, 1 = Backticks, 2 = Tildes, -1 = Invalid
1445
+ *
1446
+ * # Safety
1447
+ *
1448
+ * - `value` must be a valid null-terminated C string or NULL
1449
+ */
1450
+ int32_t kreuzberg_parse_code_block_style(const char *value);
1451
+
1452
+ /**
1453
+ * Convert CodeBlockStyle discriminant to string.
1454
+ */
1455
+ const char *kreuzberg_code_block_style_to_string(int32_t discriminant);
1456
+
1457
+ /**
1458
+ * Parse HighlightStyle from string to discriminant.
1459
+ *
1460
+ * Valid values: "double_equal" (aliases: "==", "double-equal"), "html", "bold", "none"
1461
+ * Returns: 0 = DoubleEqual, 1 = Html, 2 = Bold, 3 = None, -1 = Invalid
1462
+ *
1463
+ * # Safety
1464
+ *
1465
+ * - `value` must be a valid null-terminated C string or NULL
1466
+ */
1467
+ int32_t kreuzberg_parse_highlight_style(const char *value);
1468
+
1469
+ /**
1470
+ * Convert HighlightStyle discriminant to string.
1471
+ */
1472
+ const char *kreuzberg_highlight_style_to_string(int32_t discriminant);
1473
+
1474
+ /**
1475
+ * Parse ListIndentType from string to discriminant.
1476
+ *
1477
+ * Valid values: "spaces", "tabs"
1478
+ * Returns: 0 = Spaces, 1 = Tabs, -1 = Invalid
1479
+ *
1480
+ * # Safety
1481
+ *
1482
+ * - `value` must be a valid null-terminated C string or NULL
1483
+ */
1484
+ int32_t kreuzberg_parse_list_indent_type(const char *value);
1485
+
1486
+ /**
1487
+ * Convert ListIndentType discriminant to string.
1488
+ */
1489
+ const char *kreuzberg_list_indent_type_to_string(int32_t discriminant);
1490
+
1491
+ /**
1492
+ * Parse WhitespaceMode from string to discriminant.
1493
+ *
1494
+ * Valid values: "default", "preserve", "preserve_inner", "collapse"
1495
+ * Returns: 0 = Default, 1 = Preserve, 2 = PreserveInner, 3 = Collapse, -1 = Invalid
1496
+ *
1497
+ * # Safety
1498
+ *
1499
+ * - `value` must be a valid null-terminated C string or NULL
1500
+ */
1501
+ int32_t kreuzberg_parse_whitespace_mode(const char *value);
1502
+
1503
+ /**
1504
+ * Convert WhitespaceMode discriminant to string.
1505
+ */
1506
+ const char *kreuzberg_whitespace_mode_to_string(int32_t discriminant);
1507
+
1508
+ /**
1509
+ * Parse NewlineStyle from string to discriminant.
1510
+ *
1511
+ * Valid values: "default", "spaces", "backslash"
1512
+ * Returns: 0 = Default, 1 = Spaces, 2 = Backslash, -1 = Invalid
1513
+ *
1514
+ * # Safety
1515
+ *
1516
+ * - `value` must be a valid null-terminated C string or NULL
1517
+ */
1518
+ int32_t kreuzberg_parse_newline_style(const char *value);
1519
+
1520
+ /**
1521
+ * Convert NewlineStyle discriminant to string.
1522
+ */
1523
+ const char *kreuzberg_newline_style_to_string(int32_t discriminant);
1524
+
1525
+ /**
1526
+ * Parse PreprocessingPreset from string to discriminant.
1527
+ *
1528
+ * Valid values: "none", "conservative", "aggressive"
1529
+ * Returns: 0 = None, 1 = Conservative, 2 = Aggressive, -1 = Invalid
1530
+ *
1531
+ * # Safety
1532
+ *
1533
+ * - `value` must be a valid null-terminated C string or NULL
1534
+ */
1535
+ int32_t kreuzberg_parse_preprocessing_preset(const char *value);
1536
+
1537
+ /**
1538
+ * Convert PreprocessingPreset discriminant to string.
1539
+ */
1540
+ const char *kreuzberg_preprocessing_preset_to_string(int32_t discriminant);
1541
+
1373
1542
  /**
1374
1543
  * Free a batch result returned by batch extraction functions.
1375
1544
  *