kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -66,6 +66,8 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
66
66
  chunks,
67
67
  images,
68
68
  pages,
69
+ djot_content: _,
70
+ elements: _,
69
71
  } = result;
70
72
 
71
73
  let sanitized_content = if content.contains('\0') {
@@ -345,6 +347,8 @@ mod tests {
345
347
  chunks: None,
346
348
  images: None,
347
349
  pages: None,
350
+ djot_content: None,
351
+ elements: None,
348
352
  };
349
353
 
350
354
  let c_result = to_c_extraction_result(result);
@@ -382,6 +386,8 @@ mod tests {
382
386
  chunks: None,
383
387
  images: None,
384
388
  pages: None,
389
+ djot_content: None,
390
+ elements: None,
385
391
  };
386
392
 
387
393
  let c_result = to_c_extraction_result(result);
@@ -429,6 +435,8 @@ mod tests {
429
435
  chunks: None,
430
436
  images: None,
431
437
  pages: None,
438
+ djot_content: None,
439
+ elements: None,
432
440
  };
433
441
 
434
442
  let c_result = to_c_extraction_result(result);
@@ -506,6 +514,8 @@ mod tests {
506
514
  chunks: Some(vec![chunk]),
507
515
  images: None,
508
516
  pages: None,
517
+ djot_content: None,
518
+ elements: None,
509
519
  };
510
520
 
511
521
  let c_result = to_c_extraction_result(result);
@@ -0,0 +1,421 @@
1
+ //! Shared FFI functions for parsing HTML-to-Markdown conversion options.
2
+ //!
3
+ //! This module provides C FFI functions for parsing enum values from strings,
4
+ //! eliminating duplication across language bindings (Node.js, Python, Ruby).
5
+ //! Each enum parser returns an i32 discriminant (or -1 for invalid input).
6
+
7
+ use std::ffi::{CStr, c_char};
8
+ use std::ptr;
9
+
10
/// Translate a heading-style name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"atx"`
/// - `1` for `"underlined"`
/// - `2` for `"atx_closed"` or `"atx-closed"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known style
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_heading_style(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("atx") => 0,
        Ok("underlined") => 1,
        Ok("atx_closed" | "atx-closed") => 2,
        _ => -1,
    }
}
36
+
37
/// Look up the canonical name of a HeadingStyle discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"atx"`, `"underlined"` or
/// `"atx_closed"` for `0`, `1`, `2`), or NULL for any other discriminant.
/// The string has static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_heading_style_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 3] = [c"atx", c"underlined", c"atx_closed"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
49
+
50
/// Translate a code-block-style name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"indented"`
/// - `1` for `"backticks"`
/// - `2` for `"tildes"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known style
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_code_block_style(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("indented") => 0,
        Ok("backticks") => 1,
        Ok("tildes") => 2,
        _ => -1,
    }
}
76
+
77
/// Look up the canonical name of a CodeBlockStyle discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"indented"`, `"backticks"`
/// or `"tildes"` for `0`, `1`, `2`), or NULL for any other discriminant.
/// The string has static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_code_block_style_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 3] = [c"indented", c"backticks", c"tildes"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
87
+
88
/// Translate a highlight-style name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"double_equal"`, `"double-equal"` or `"=="`
/// - `1` for `"html"`
/// - `2` for `"bold"`
/// - `3` for `"none"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known style
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_highlight_style(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("double_equal" | "==" | "double-equal") => 0,
        Ok("html") => 1,
        Ok("bold") => 2,
        Ok("none") => 3,
        _ => -1,
    }
}
115
+
116
/// Look up the canonical name of a HighlightStyle discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"double_equal"`, `"html"`,
/// `"bold"` or `"none"` for `0` through `3`), or NULL for any other
/// discriminant. The string has static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_highlight_style_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 4] = [c"double_equal", c"html", c"bold", c"none"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
127
+
128
/// Translate a list-indent-type name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"spaces"`
/// - `1` for `"tabs"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known type
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_list_indent_type(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("spaces") => 0,
        Ok("tabs") => 1,
        _ => -1,
    }
}
153
+
154
/// Look up the canonical name of a ListIndentType discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"spaces"` for `0`,
/// `"tabs"` for `1`), or NULL for any other discriminant. The string has
/// static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_list_indent_type_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 2] = [c"spaces", c"tabs"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
163
+
164
/// Parse WhitespaceMode from string to discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// Valid values: "default", "preserve", "preserve_inner" | "preserve-inner", "collapse"
/// Returns: 0 = Default, 1 = Preserve, 2 = PreserveInner, 3 = Collapse, -1 = Invalid
///
/// `-1` is also returned when `value` is NULL or is not valid UTF-8.
///
/// # Safety
///
/// - `value` must be a valid null-terminated C string or NULL
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_whitespace_mode(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("default") => 0,
        Ok("preserve") => 1,
        // Both the underscore and the dash spelling are accepted.
        Ok("preserve_inner" | "preserve-inner") => 2,
        Ok("collapse") => 3,
        _ => -1,
    }
}
191
+
192
/// Look up the canonical name of a WhitespaceMode discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"default"`, `"preserve"`,
/// `"preserve_inner"` or `"collapse"` for `0` through `3`), or NULL for any
/// other discriminant. The string has static storage, so callers must not
/// free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_whitespace_mode_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 4] = [c"default", c"preserve", c"preserve_inner", c"collapse"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
203
+
204
/// Translate a newline-style name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"default"`
/// - `1` for `"spaces"`
/// - `2` for `"backslash"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known style
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_newline_style(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("default") => 0,
        Ok("spaces") => 1,
        Ok("backslash") => 2,
        _ => -1,
    }
}
230
+
231
/// Look up the canonical name of a NewlineStyle discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"default"`, `"spaces"` or
/// `"backslash"` for `0`, `1`, `2`), or NULL for any other discriminant.
/// The string has static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_newline_style_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 3] = [c"default", c"spaces", c"backslash"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
241
+
242
/// Translate a preprocessing-preset name into its numeric discriminant.
///
/// The name is lowercased before matching, so the comparison ignores case.
///
/// # Returns
///
/// - `0` for `"none"`
/// - `1` for `"conservative"`
/// - `2` for `"aggressive"`
/// - `-1` when `value` is NULL, is not valid UTF-8, or names no known preset
///
/// # Safety
///
/// - `value` must be NULL or point to a valid null-terminated C string
#[unsafe(no_mangle)]
pub unsafe extern "C" fn kreuzberg_parse_preprocessing_preset(value: *const c_char) -> i32 {
    if value.is_null() {
        return -1;
    }

    // SAFETY: `value` is non-null, and the caller guarantees it points to a
    // valid null-terminated C string.
    let lowered = unsafe { CStr::from_ptr(value) }.to_str().map(str::to_lowercase);

    // A UTF-8 decoding failure falls through to the catch-all arm.
    match lowered.as_deref() {
        Ok("none") => 0,
        Ok("conservative") => 1,
        Ok("aggressive") => 2,
        _ => -1,
    }
}
268
+
269
/// Look up the canonical name of a PreprocessingPreset discriminant.
///
/// # Returns
///
/// A pointer to a static null-terminated string (`"none"`, `"conservative"`
/// or `"aggressive"` for `0`, `1`, `2`), or NULL for any other discriminant.
/// The string has static storage, so callers must not free it.
#[unsafe(no_mangle)]
pub extern "C" fn kreuzberg_preprocessing_preset_to_string(discriminant: i32) -> *const c_char {
    // Indexed by discriminant.
    static NAMES: [&CStr; 3] = [c"none", c"conservative", c"aggressive"];

    // Negative values fail the conversion; too-large values miss the table.
    usize::try_from(discriminant)
        .ok()
        .and_then(|index| NAMES.get(index))
        .map_or(ptr::null(), |name| name.as_ptr())
}
279
+
280
#[cfg(test)]
mod tests {
    //! Unit tests for the enum parsers and their `*_to_string` counterparts.
    //!
    //! Every parser is checked against its accepted spellings (including dash
    //! aliases and mixed case) and against NULL, unknown, empty and non-UTF-8
    //! input. Every `*_to_string` function is checked for its names and for
    //! out-of-range discriminants.

    use super::*;
    use std::ffi::CString;

    /// Signature shared by every `kreuzberg_parse_*` function in this module.
    type Parser = unsafe extern "C" fn(*const c_char) -> i32;

    /// Signature shared by every `kreuzberg_*_to_string` function in this module.
    type Namer = extern "C" fn(i32) -> *const c_char;

    /// Runs `parser` on `text` converted to a null-terminated C string.
    fn parse(parser: Parser, text: &str) -> i32 {
        let owned = CString::new(text).expect("test input must not contain NUL");
        // SAFETY: `owned` is a valid null-terminated string that outlives the call.
        unsafe { parser(owned.as_ptr()) }
    }

    /// Returns the name `namer` yields for `discriminant`, or `None` for NULL.
    fn name_of(namer: Namer, discriminant: i32) -> Option<&'static str> {
        let raw = namer(discriminant);
        if raw.is_null() {
            return None;
        }
        // SAFETY: non-null results point to static null-terminated literals.
        let name = unsafe { CStr::from_ptr(raw) };
        Some(name.to_str().expect("enum names are valid UTF-8"))
    }

    /// Asserts that `parser` answers -1 for NULL, unknown, empty and non-UTF-8 input.
    fn assert_rejects_bad_input(parser: Parser) {
        // SAFETY: the parsers explicitly accept NULL.
        assert_eq!(unsafe { parser(ptr::null()) }, -1);
        assert_eq!(parse(parser, "invalid"), -1);
        assert_eq!(parse(parser, ""), -1);

        let not_utf8 = b"\xff\0";
        // SAFETY: `not_utf8` is null-terminated and outlives the call.
        assert_eq!(unsafe { parser(not_utf8.as_ptr().cast()) }, -1);
    }

    /// Asserts that discriminants `0..count` survive a to_string/parse round
    /// trip and that out-of-range discriminants have no name.
    fn assert_round_trip(parser: Parser, namer: Namer, count: i32) {
        for discriminant in 0..count {
            let name = name_of(namer, discriminant).expect("valid discriminant has a name");
            assert_eq!(parse(parser, name), discriminant);
        }
        assert_eq!(name_of(namer, count), None);
        assert_eq!(name_of(namer, -1), None);
        assert_eq!(name_of(namer, 99), None);
    }

    #[test]
    fn test_heading_style_parsing() {
        let parser: Parser = kreuzberg_parse_heading_style;
        assert_eq!(parse(parser, "atx"), 0);
        assert_eq!(parse(parser, "underlined"), 1);
        assert_eq!(parse(parser, "atx_closed"), 2);
        assert_eq!(parse(parser, "atx-closed"), 2);
        assert_eq!(parse(parser, "ATX"), 0);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_heading_style_to_string() {
        let namer: Namer = kreuzberg_heading_style_to_string;
        assert_eq!(name_of(namer, 0), Some("atx"));
        assert_eq!(name_of(namer, 1), Some("underlined"));
        assert_eq!(name_of(namer, 2), Some("atx_closed"));
        assert_round_trip(kreuzberg_parse_heading_style, namer, 3);
    }

    #[test]
    fn test_code_block_style_parsing() {
        let parser: Parser = kreuzberg_parse_code_block_style;
        assert_eq!(parse(parser, "indented"), 0);
        assert_eq!(parse(parser, "backticks"), 1);
        assert_eq!(parse(parser, "tildes"), 2);
        assert_eq!(parse(parser, "Tildes"), 2);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_code_block_style_to_string() {
        let namer: Namer = kreuzberg_code_block_style_to_string;
        assert_eq!(name_of(namer, 0), Some("indented"));
        assert_eq!(name_of(namer, 1), Some("backticks"));
        assert_eq!(name_of(namer, 2), Some("tildes"));
        assert_round_trip(kreuzberg_parse_code_block_style, namer, 3);
    }

    #[test]
    fn test_highlight_style_parsing() {
        let parser: Parser = kreuzberg_parse_highlight_style;
        assert_eq!(parse(parser, "double_equal"), 0);
        assert_eq!(parse(parser, "double-equal"), 0);
        assert_eq!(parse(parser, "=="), 0);
        assert_eq!(parse(parser, "html"), 1);
        assert_eq!(parse(parser, "HTML"), 1);
        assert_eq!(parse(parser, "bold"), 2);
        assert_eq!(parse(parser, "none"), 3);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_highlight_style_to_string() {
        let namer: Namer = kreuzberg_highlight_style_to_string;
        assert_eq!(name_of(namer, 0), Some("double_equal"));
        assert_eq!(name_of(namer, 1), Some("html"));
        assert_eq!(name_of(namer, 2), Some("bold"));
        assert_eq!(name_of(namer, 3), Some("none"));
        assert_round_trip(kreuzberg_parse_highlight_style, namer, 4);
    }

    #[test]
    fn test_list_indent_type_parsing() {
        let parser: Parser = kreuzberg_parse_list_indent_type;
        assert_eq!(parse(parser, "spaces"), 0);
        assert_eq!(parse(parser, "tabs"), 1);
        assert_eq!(parse(parser, "TABS"), 1);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_list_indent_type_to_string() {
        let namer: Namer = kreuzberg_list_indent_type_to_string;
        assert_eq!(name_of(namer, 0), Some("spaces"));
        assert_eq!(name_of(namer, 1), Some("tabs"));
        assert_round_trip(kreuzberg_parse_list_indent_type, namer, 2);
    }

    #[test]
    fn test_whitespace_mode_parsing() {
        let parser: Parser = kreuzberg_parse_whitespace_mode;
        assert_eq!(parse(parser, "default"), 0);
        assert_eq!(parse(parser, "preserve"), 1);
        assert_eq!(parse(parser, "preserve_inner"), 2);
        assert_eq!(parse(parser, "preserve-inner"), 2);
        assert_eq!(parse(parser, "collapse"), 3);
        assert_eq!(parse(parser, "Collapse"), 3);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_whitespace_mode_to_string() {
        let namer: Namer = kreuzberg_whitespace_mode_to_string;
        assert_eq!(name_of(namer, 0), Some("default"));
        assert_eq!(name_of(namer, 1), Some("preserve"));
        assert_eq!(name_of(namer, 2), Some("preserve_inner"));
        assert_eq!(name_of(namer, 3), Some("collapse"));
        assert_round_trip(kreuzberg_parse_whitespace_mode, namer, 4);
    }

    #[test]
    fn test_newline_style_parsing() {
        let parser: Parser = kreuzberg_parse_newline_style;
        assert_eq!(parse(parser, "default"), 0);
        assert_eq!(parse(parser, "spaces"), 1);
        assert_eq!(parse(parser, "backslash"), 2);
        assert_eq!(parse(parser, "Backslash"), 2);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_newline_style_to_string() {
        let namer: Namer = kreuzberg_newline_style_to_string;
        assert_eq!(name_of(namer, 0), Some("default"));
        assert_eq!(name_of(namer, 1), Some("spaces"));
        assert_eq!(name_of(namer, 2), Some("backslash"));
        assert_round_trip(kreuzberg_parse_newline_style, namer, 3);
    }

    #[test]
    fn test_preprocessing_preset_parsing() {
        let parser: Parser = kreuzberg_parse_preprocessing_preset;
        assert_eq!(parse(parser, "none"), 0);
        assert_eq!(parse(parser, "conservative"), 1);
        assert_eq!(parse(parser, "aggressive"), 2);
        assert_eq!(parse(parser, "AGGRESSIVE"), 2);
        assert_rejects_bad_input(parser);
    }

    #[test]
    fn test_preprocessing_preset_to_string() {
        let namer: Namer = kreuzberg_preprocessing_preset_to_string;
        assert_eq!(name_of(namer, 0), Some("none"));
        assert_eq!(name_of(namer, 1), Some("conservative"));
        assert_eq!(name_of(namer, 2), Some("aggressive"));
        assert_round_trip(kreuzberg_parse_preprocessing_preset, namer, 3);
    }
}
@@ -5,9 +5,11 @@
5
5
 
6
6
  mod batch_streaming;
7
7
  mod config;
8
+ mod config_builder;
8
9
  mod error;
9
10
  mod extraction;
10
11
  mod helpers;
12
+ mod html_options;
11
13
  mod memory;
12
14
  mod mime;
13
15
  mod panic_shield;
@@ -28,6 +30,13 @@ pub use config::{
28
30
  kreuzberg_config_get_field, kreuzberg_config_is_valid, kreuzberg_config_merge, kreuzberg_config_to_json,
29
31
  kreuzberg_get_embedding_preset, kreuzberg_list_embedding_presets, kreuzberg_load_extraction_config_from_file,
30
32
  };
33
+ pub use config_builder::{
34
+ kreuzberg_config_builder_build, kreuzberg_config_builder_free, kreuzberg_config_builder_new,
35
+ kreuzberg_config_builder_set_chunking, kreuzberg_config_builder_set_image_extraction,
36
+ kreuzberg_config_builder_set_language_detection, kreuzberg_config_builder_set_ocr,
37
+ kreuzberg_config_builder_set_pdf, kreuzberg_config_builder_set_post_processor,
38
+ kreuzberg_config_builder_set_use_cache,
39
+ };
31
40
  pub use error::ErrorCode as KreuzbergErrorCode;
32
41
  pub use error::{
33
42
  CErrorDetails, kreuzberg_classify_error, kreuzberg_error_code_count, kreuzberg_error_code_description,
@@ -40,6 +49,13 @@ pub use extraction::{
40
49
  kreuzberg_extract_bytes_sync_with_config, kreuzberg_extract_file_sync, kreuzberg_extract_file_sync_with_config,
41
50
  };
42
51
  pub use helpers::*;
52
+ pub use html_options::{
53
+ kreuzberg_code_block_style_to_string, kreuzberg_heading_style_to_string, kreuzberg_highlight_style_to_string,
54
+ kreuzberg_list_indent_type_to_string, kreuzberg_newline_style_to_string, kreuzberg_parse_code_block_style,
55
+ kreuzberg_parse_heading_style, kreuzberg_parse_highlight_style, kreuzberg_parse_list_indent_type,
56
+ kreuzberg_parse_newline_style, kreuzberg_parse_preprocessing_preset, kreuzberg_parse_whitespace_mode,
57
+ kreuzberg_preprocessing_preset_to_string, kreuzberg_whitespace_mode_to_string,
58
+ };
43
59
  pub use memory::{kreuzberg_clone_string, kreuzberg_free_batch_result, kreuzberg_free_result, kreuzberg_free_string};
44
60
  pub use mime::{
45
61
  kreuzberg_detect_mime_type, kreuzberg_detect_mime_type_from_bytes, kreuzberg_detect_mime_type_from_path,
@@ -139,6 +139,17 @@ macro_rules! ffi_panic_guard {
139
139
  }
140
140
  }
141
141
  }};
142
+ ($function_name:expr, $body:expr, $default:expr) => {{
143
+ match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| $body)) {
144
+ Ok(result) => result,
145
+ Err(panic_info) => {
146
+ let context =
147
+ kreuzberg::panic_context::PanicContext::new(file!(), line!(), $function_name, panic_info.as_ref());
148
+ $crate::panic_shield::set_structured_error($crate::panic_shield::StructuredError::from_panic(context));
149
+ $default
150
+ }
151
+ }
152
+ }};
142
153
  }
143
154
 
144
155
  /// Macro to wrap FFI functions that return bool with panic catching.
@@ -167,6 +167,8 @@ impl OcrBackend for FfiOcrBackend {
167
167
  chunks: None,
168
168
  images: None,
169
169
  pages: None,
170
+ djot_content: None,
171
+ elements: None,
170
172
  })
171
173
  }
172
174