kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,1058 @@
1
+ //! Configuration parsing and conversion for Ruby bindings
2
+ //!
3
+ //! Handles conversion between Ruby Hash configurations and Rust config types.
4
+ //! Includes parsing for all nested configuration structures.
5
+
6
+ use crate::error_handling::{runtime_error, validation_error};
7
+ use crate::helpers::{get_kw, json_value_to_ruby, ruby_value_to_json, symbol_to_string};
8
+
9
+ use html_to_markdown_rs::options::{
10
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
11
+ PreprocessingPreset,
12
+ };
13
+ use html_to_markdown_rs::WhitespaceMode;
14
+ use kreuzberg::core::config::PageConfig;
15
+ use kreuzberg::keywords::{
16
+ KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
17
+ YakeParams as RustYakeParams,
18
+ };
19
+ use kreuzberg::types::TesseractConfig as RustTesseractConfig;
20
+ use kreuzberg::pdf::HierarchyConfig;
21
+ use kreuzberg::{
22
+ ChunkingConfig, EmbeddingConfig, ExtractionConfig, ImageExtractionConfig, ImagePreprocessingConfig,
23
+ LanguageDetectionConfig, OcrConfig, OutputFormat, PdfConfig, PostProcessorConfig, TokenReductionConfig,
24
+ };
25
+ use magnus::{Error, RArray, RHash, Ruby, TryConvert, Value};
26
+ use magnus::value::ReprValue;
27
+ use std::fs;
28
+
29
+ /// Parse OcrConfig from Ruby Hash
30
+ pub fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
31
+ let backend = if let Some(val) = get_kw(ruby, hash, "backend") {
32
+ symbol_to_string(val)?
33
+ } else {
34
+ "tesseract".to_string()
35
+ };
36
+
37
+ let language = if let Some(val) = get_kw(ruby, hash, "language") {
38
+ symbol_to_string(val)?
39
+ } else {
40
+ "eng".to_string()
41
+ };
42
+
43
+ let mut config = OcrConfig {
44
+ backend,
45
+ language,
46
+ tesseract_config: None,
47
+ output_format: None,
48
+ };
49
+
50
+ if let Some(val) = get_kw(ruby, hash, "tesseract_config")
51
+ && !val.is_nil()
52
+ {
53
+ let tc_json = ruby_value_to_json(val)?;
54
+ let parsed: RustTesseractConfig =
55
+ serde_json::from_value(tc_json).map_err(|e| runtime_error(format!("Invalid tesseract_config: {}", e)))?;
56
+ config.tesseract_config = Some(parsed);
57
+ }
58
+
59
+ Ok(config)
60
+ }
61
+
62
+ /// Parse ChunkingConfig from Ruby Hash
63
+ pub fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig, Error> {
64
+ let max_chars = if let Some(val) = get_kw(ruby, hash, "max_chars") {
65
+ usize::try_convert(val)?
66
+ } else {
67
+ 1000
68
+ };
69
+
70
+ let max_overlap = if let Some(val) = get_kw(ruby, hash, "max_overlap") {
71
+ usize::try_convert(val)?
72
+ } else {
73
+ 200
74
+ };
75
+
76
+ let preset = if let Some(val) = get_kw(ruby, hash, "preset")
77
+ && !val.is_nil()
78
+ {
79
+ Some(symbol_to_string(val)?)
80
+ } else {
81
+ None
82
+ };
83
+
84
+ let embedding = if let Some(val) = get_kw(ruby, hash, "embedding")
85
+ && !val.is_nil()
86
+ {
87
+ let json_value = ruby_value_to_json(val)?;
88
+ let parsed: EmbeddingConfig = serde_json::from_value(json_value)
89
+ .map_err(|e| runtime_error(format!("Invalid chunking.embedding: {}", e)))?;
90
+ Some(parsed)
91
+ } else {
92
+ None
93
+ };
94
+
95
+ let config = ChunkingConfig {
96
+ max_chars,
97
+ max_overlap,
98
+ embedding,
99
+ preset,
100
+ };
101
+
102
+ Ok(config)
103
+ }
104
+
105
+ /// Parse LanguageDetectionConfig from Ruby Hash
106
+ pub fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageDetectionConfig, Error> {
107
+ let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
108
+ bool::try_convert(val)?
109
+ } else {
110
+ true
111
+ };
112
+
113
+ let min_confidence = if let Some(val) = get_kw(ruby, hash, "min_confidence") {
114
+ f64::try_convert(val)?
115
+ } else {
116
+ 0.8
117
+ };
118
+
119
+ let detect_multiple = if let Some(val) = get_kw(ruby, hash, "detect_multiple") {
120
+ bool::try_convert(val)?
121
+ } else {
122
+ false
123
+ };
124
+
125
+ let config = LanguageDetectionConfig {
126
+ enabled,
127
+ min_confidence,
128
+ detect_multiple,
129
+ };
130
+
131
+ Ok(config)
132
+ }
133
+
134
+ /// Parse HierarchyConfig from Ruby Hash
135
+ pub fn parse_hierarchy_config(ruby: &Ruby, hash: RHash) -> Result<HierarchyConfig, Error> {
136
+ let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
137
+ bool::try_convert(val)?
138
+ } else {
139
+ true
140
+ };
141
+
142
+ let k_clusters = if let Some(val) = get_kw(ruby, hash, "k_clusters") {
143
+ usize::try_convert(val)?
144
+ } else {
145
+ 6
146
+ };
147
+
148
+ let include_bbox = if let Some(val) = get_kw(ruby, hash, "include_bbox") {
149
+ bool::try_convert(val)?
150
+ } else {
151
+ true
152
+ };
153
+
154
+ let ocr_coverage_threshold = if let Some(val) = get_kw(ruby, hash, "ocr_coverage_threshold") {
155
+ if !val.is_nil() {
156
+ Some(f64::try_convert(val)? as f32)
157
+ } else {
158
+ None
159
+ }
160
+ } else {
161
+ None
162
+ };
163
+
164
+ let config = HierarchyConfig {
165
+ enabled,
166
+ k_clusters,
167
+ include_bbox,
168
+ ocr_coverage_threshold,
169
+ };
170
+
171
+ Ok(config)
172
+ }
173
+
174
+ /// Parse PdfConfig from Ruby Hash
175
+ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
176
+ let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
177
+ bool::try_convert(val)?
178
+ } else {
179
+ false
180
+ };
181
+
182
+ let passwords = if let Some(val) = get_kw(ruby, hash, "passwords") {
183
+ if !val.is_nil() {
184
+ let arr = RArray::try_convert(val)?;
185
+ Some(arr.to_vec::<String>()?)
186
+ } else {
187
+ None
188
+ }
189
+ } else {
190
+ None
191
+ };
192
+
193
+ let extract_metadata = if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
194
+ bool::try_convert(val)?
195
+ } else {
196
+ true
197
+ };
198
+
199
+ let hierarchy = if let Some(val) = get_kw(ruby, hash, "hierarchy") {
200
+ if !val.is_nil() {
201
+ let h_hash = RHash::try_convert(val)?;
202
+ Some(parse_hierarchy_config(ruby, h_hash)?)
203
+ } else {
204
+ None
205
+ }
206
+ } else {
207
+ None
208
+ };
209
+
210
+ let config = PdfConfig {
211
+ extract_images,
212
+ passwords,
213
+ extract_metadata,
214
+ hierarchy,
215
+ };
216
+
217
+ Ok(config)
218
+ }
219
+
220
+ /// Parse ImageExtractionConfig from Ruby Hash
221
+ pub fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageExtractionConfig, Error> {
222
+ let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
223
+ bool::try_convert(val)?
224
+ } else {
225
+ true
226
+ };
227
+
228
+ let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
229
+ i32::try_convert(val)?
230
+ } else {
231
+ 300
232
+ };
233
+
234
+ let max_image_dimension = if let Some(val) = get_kw(ruby, hash, "max_image_dimension") {
235
+ i32::try_convert(val)?
236
+ } else {
237
+ 4096
238
+ };
239
+
240
+ let auto_adjust_dpi = if let Some(val) = get_kw(ruby, hash, "auto_adjust_dpi") {
241
+ bool::try_convert(val)?
242
+ } else {
243
+ true
244
+ };
245
+
246
+ let min_dpi = if let Some(val) = get_kw(ruby, hash, "min_dpi") {
247
+ i32::try_convert(val)?
248
+ } else {
249
+ 72
250
+ };
251
+
252
+ let max_dpi = if let Some(val) = get_kw(ruby, hash, "max_dpi") {
253
+ i32::try_convert(val)?
254
+ } else {
255
+ 600
256
+ };
257
+
258
+ let config = ImageExtractionConfig {
259
+ extract_images,
260
+ target_dpi,
261
+ max_image_dimension,
262
+ auto_adjust_dpi,
263
+ min_dpi,
264
+ max_dpi,
265
+ };
266
+
267
+ Ok(config)
268
+ }
269
+
270
+ /// Parse ImagePreprocessingConfig from Ruby Hash
271
+ ///
272
+ /// Note: Currently not used in ExtractionConfig but provided for completeness.
273
+ /// ImagePreprocessingConfig is typically used in OCR operations.
274
+ #[allow(dead_code)]
275
+ pub fn parse_image_preprocessing_config(ruby: &Ruby, hash: RHash) -> Result<ImagePreprocessingConfig, Error> {
276
+ let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
277
+ i32::try_convert(val)?
278
+ } else {
279
+ 300
280
+ };
281
+
282
+ let auto_rotate = if let Some(val) = get_kw(ruby, hash, "auto_rotate") {
283
+ bool::try_convert(val)?
284
+ } else {
285
+ true
286
+ };
287
+
288
+ let deskew = if let Some(val) = get_kw(ruby, hash, "deskew") {
289
+ bool::try_convert(val)?
290
+ } else {
291
+ true
292
+ };
293
+
294
+ let denoise = if let Some(val) = get_kw(ruby, hash, "denoise") {
295
+ bool::try_convert(val)?
296
+ } else {
297
+ false
298
+ };
299
+
300
+ let contrast_enhance = if let Some(val) = get_kw(ruby, hash, "contrast_enhance") {
301
+ bool::try_convert(val)?
302
+ } else {
303
+ false
304
+ };
305
+
306
+ let binarization_method = if let Some(val) = get_kw(ruby, hash, "binarization_method") {
307
+ symbol_to_string(val)?
308
+ } else {
309
+ "otsu".to_string()
310
+ };
311
+
312
+ let invert_colors = if let Some(val) = get_kw(ruby, hash, "invert_colors") {
313
+ bool::try_convert(val)?
314
+ } else {
315
+ false
316
+ };
317
+
318
+ let config = ImagePreprocessingConfig {
319
+ target_dpi,
320
+ auto_rotate,
321
+ deskew,
322
+ denoise,
323
+ contrast_enhance,
324
+ binarization_method,
325
+ invert_colors,
326
+ };
327
+
328
+ Ok(config)
329
+ }
330
+
331
+ /// Parse PostProcessorConfig from Ruby Hash
332
+ pub fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorConfig, Error> {
333
+ let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
334
+ bool::try_convert(val)?
335
+ } else {
336
+ true
337
+ };
338
+
339
+ let enabled_processors = if let Some(val) = get_kw(ruby, hash, "enabled_processors")
340
+ && !val.is_nil()
341
+ {
342
+ let arr = RArray::try_convert(val)?;
343
+ Some(arr.to_vec::<String>()?)
344
+ } else {
345
+ None
346
+ };
347
+
348
+ let disabled_processors = if let Some(val) = get_kw(ruby, hash, "disabled_processors")
349
+ && !val.is_nil()
350
+ {
351
+ let arr = RArray::try_convert(val)?;
352
+ Some(arr.to_vec::<String>()?)
353
+ } else {
354
+ None
355
+ };
356
+
357
+ let config = PostProcessorConfig {
358
+ enabled,
359
+ enabled_processors,
360
+ disabled_processors,
361
+ enabled_set: None,
362
+ disabled_set: None,
363
+ };
364
+
365
+ Ok(config)
366
+ }
367
+
368
+ /// Parse TokenReductionConfig from Ruby Hash
369
+ pub fn parse_token_reduction_config(ruby: &Ruby, hash: RHash) -> Result<TokenReductionConfig, Error> {
370
+ let mode = if let Some(val) = get_kw(ruby, hash, "mode") {
371
+ symbol_to_string(val)?
372
+ } else {
373
+ "off".to_string()
374
+ };
375
+
376
+ let preserve_important_words = if let Some(val) = get_kw(ruby, hash, "preserve_important_words") {
377
+ bool::try_convert(val)?
378
+ } else {
379
+ true
380
+ };
381
+
382
+ let config = TokenReductionConfig {
383
+ mode,
384
+ preserve_important_words,
385
+ };
386
+
387
+ Ok(config)
388
+ }
389
+
390
+ /// Parse KeywordConfig from Ruby Hash
391
+ pub fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, Error> {
392
+ let mut config = RustKeywordConfig::default();
393
+
394
+ if let Some(val) = get_kw(ruby, hash, "algorithm") {
395
+ let algo = symbol_to_string(val)?;
396
+ config.algorithm = match algo.to_lowercase().as_str() {
397
+ "yake" => RustKeywordAlgorithm::Yake,
398
+ "rake" => RustKeywordAlgorithm::Rake,
399
+ other => {
400
+ return Err(runtime_error(format!(
401
+ "Invalid keywords.algorithm '{}', expected 'yake' or 'rake'",
402
+ other
403
+ )));
404
+ }
405
+ };
406
+ }
407
+
408
+ if let Some(val) = get_kw(ruby, hash, "max_keywords") {
409
+ config.max_keywords = usize::try_convert(val)?;
410
+ }
411
+
412
+ if let Some(val) = get_kw(ruby, hash, "min_score") {
413
+ config.min_score = f64::try_convert(val)? as f32;
414
+ }
415
+
416
+ if let Some(val) = get_kw(ruby, hash, "ngram_range") {
417
+ let ary = RArray::try_convert(val)?;
418
+ if ary.len() == 2 {
419
+ let values = ary.to_vec::<i64>()?;
420
+ config.ngram_range = (values[0] as usize, values[1] as usize);
421
+ } else {
422
+ return Err(runtime_error("keywords.ngram_range must have exactly two values"));
423
+ }
424
+ }
425
+
426
+ if let Some(val) = get_kw(ruby, hash, "language")
427
+ && !val.is_nil()
428
+ {
429
+ config.language = Some(symbol_to_string(val)?);
430
+ }
431
+
432
+ if let Some(val) = get_kw(ruby, hash, "yake_params")
433
+ && !val.is_nil()
434
+ {
435
+ let yake_hash = RHash::try_convert(val)?;
436
+ let window = if let Some(window_val) = get_kw(ruby, yake_hash, "window_size") {
437
+ usize::try_convert(window_val)?
438
+ } else {
439
+ 2
440
+ };
441
+ config.yake_params = Some(RustYakeParams { window_size: window });
442
+ }
443
+
444
+ if let Some(val) = get_kw(ruby, hash, "rake_params")
445
+ && !val.is_nil()
446
+ {
447
+ let rake_hash = RHash::try_convert(val)?;
448
+ let mut params = RustRakeParams::default();
449
+ if let Some(val) = get_kw(ruby, rake_hash, "min_word_length") {
450
+ params.min_word_length = usize::try_convert(val)?;
451
+ }
452
+ if let Some(val) = get_kw(ruby, rake_hash, "max_words_per_phrase") {
453
+ params.max_words_per_phrase = usize::try_convert(val)?;
454
+ }
455
+ config.rake_params = Some(params);
456
+ }
457
+
458
+ Ok(config)
459
+ }
460
+
461
+ /// Parse HTML conversion options from Ruby Hash
462
+ pub fn parse_html_options(ruby: &Ruby, hash: RHash) -> Result<ConversionOptions, Error> {
463
+ let mut options = ConversionOptions::default();
464
+
465
+ if let Some(val) = get_kw(ruby, hash, "heading_style") {
466
+ let style = symbol_to_string(val)?;
467
+ options.heading_style = match style.to_lowercase().as_str() {
468
+ "atx" => HeadingStyle::Atx,
469
+ "underlined" => HeadingStyle::Underlined,
470
+ "atx_closed" | "atx-closed" => HeadingStyle::AtxClosed,
471
+ other => return Err(runtime_error(format!("Invalid html_options.heading_style '{}'", other))),
472
+ };
473
+ }
474
+
475
+ if let Some(val) = get_kw(ruby, hash, "list_indent_type") {
476
+ let val_str = symbol_to_string(val)?;
477
+ options.list_indent_type = match val_str.to_lowercase().as_str() {
478
+ "spaces" => ListIndentType::Spaces,
479
+ "tabs" => ListIndentType::Tabs,
480
+ other => {
481
+ return Err(runtime_error(format!(
482
+ "Invalid html_options.list_indent_type '{}'",
483
+ other
484
+ )));
485
+ }
486
+ };
487
+ }
488
+
489
+ if let Some(val) = get_kw(ruby, hash, "list_indent_width") {
490
+ options.list_indent_width = usize::try_convert(val)?;
491
+ }
492
+
493
+ if let Some(val) = get_kw(ruby, hash, "bullets") {
494
+ options.bullets = String::try_convert(val)?;
495
+ }
496
+
497
+ if let Some(val) = get_kw(ruby, hash, "strong_em_symbol") {
498
+ let symbol = String::try_convert(val)?;
499
+ let mut chars = symbol.chars();
500
+ options.strong_em_symbol = chars
501
+ .next()
502
+ .ok_or_else(|| runtime_error("html_options.strong_em_symbol must not be empty"))?;
503
+ }
504
+
505
+ if let Some(val) = get_kw(ruby, hash, "escape_asterisks") {
506
+ options.escape_asterisks = bool::try_convert(val)?;
507
+ }
508
+ if let Some(val) = get_kw(ruby, hash, "escape_underscores") {
509
+ options.escape_underscores = bool::try_convert(val)?;
510
+ }
511
+ if let Some(val) = get_kw(ruby, hash, "escape_misc") {
512
+ options.escape_misc = bool::try_convert(val)?;
513
+ }
514
+ if let Some(val) = get_kw(ruby, hash, "escape_ascii") {
515
+ options.escape_ascii = bool::try_convert(val)?;
516
+ }
517
+
518
+ if let Some(val) = get_kw(ruby, hash, "code_language") {
519
+ options.code_language = String::try_convert(val)?;
520
+ }
521
+
522
+ if let Some(val) = get_kw(ruby, hash, "autolinks") {
523
+ options.autolinks = bool::try_convert(val)?;
524
+ }
525
+
526
+ if let Some(val) = get_kw(ruby, hash, "default_title") {
527
+ options.default_title = bool::try_convert(val)?;
528
+ }
529
+
530
+ if let Some(val) = get_kw(ruby, hash, "br_in_tables") {
531
+ options.br_in_tables = bool::try_convert(val)?;
532
+ }
533
+
534
+ if let Some(val) = get_kw(ruby, hash, "hocr_spatial_tables") {
535
+ options.hocr_spatial_tables = bool::try_convert(val)?;
536
+ }
537
+
538
+ if let Some(val) = get_kw(ruby, hash, "highlight_style") {
539
+ let style = symbol_to_string(val)?;
540
+ options.highlight_style = match style.to_lowercase().as_str() {
541
+ "double_equal" | "double-equal" => HighlightStyle::DoubleEqual,
542
+ "html" => HighlightStyle::Html,
543
+ "bold" => HighlightStyle::Bold,
544
+ "none" => HighlightStyle::None,
545
+ other => {
546
+ return Err(runtime_error(format!(
547
+ "Invalid html_options.highlight_style '{}'",
548
+ other
549
+ )));
550
+ }
551
+ };
552
+ }
553
+
554
+ if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
555
+ options.extract_metadata = bool::try_convert(val)?;
556
+ }
557
+
558
+ if let Some(val) = get_kw(ruby, hash, "whitespace_mode") {
559
+ let mode = symbol_to_string(val)?;
560
+ options.whitespace_mode = match mode.to_lowercase().as_str() {
561
+ "normalized" => WhitespaceMode::Normalized,
562
+ "strict" => WhitespaceMode::Strict,
563
+ other => {
564
+ return Err(runtime_error(format!(
565
+ "Invalid html_options.whitespace_mode '{}'",
566
+ other
567
+ )));
568
+ }
569
+ };
570
+ }
571
+
572
+ if let Some(val) = get_kw(ruby, hash, "strip_newlines") {
573
+ options.strip_newlines = bool::try_convert(val)?;
574
+ }
575
+
576
+ if let Some(val) = get_kw(ruby, hash, "wrap") {
577
+ options.wrap = bool::try_convert(val)?;
578
+ }
579
+
580
+ if let Some(val) = get_kw(ruby, hash, "wrap_width") {
581
+ options.wrap_width = usize::try_convert(val)?;
582
+ }
583
+
584
+ if let Some(val) = get_kw(ruby, hash, "convert_as_inline") {
585
+ options.convert_as_inline = bool::try_convert(val)?;
586
+ }
587
+
588
+ if let Some(val) = get_kw(ruby, hash, "sub_symbol") {
589
+ options.sub_symbol = String::try_convert(val)?;
590
+ }
591
+
592
+ if let Some(val) = get_kw(ruby, hash, "sup_symbol") {
593
+ options.sup_symbol = String::try_convert(val)?;
594
+ }
595
+
596
+ if let Some(val) = get_kw(ruby, hash, "newline_style") {
597
+ let style = symbol_to_string(val)?;
598
+ options.newline_style = match style.to_lowercase().as_str() {
599
+ "spaces" => NewlineStyle::Spaces,
600
+ "backslash" => NewlineStyle::Backslash,
601
+ other => return Err(runtime_error(format!("Invalid html_options.newline_style '{}'", other))),
602
+ };
603
+ }
604
+
605
+ if let Some(val) = get_kw(ruby, hash, "code_block_style") {
606
+ let style = symbol_to_string(val)?;
607
+ options.code_block_style = match style.to_lowercase().as_str() {
608
+ "indented" => CodeBlockStyle::Indented,
609
+ "backticks" => CodeBlockStyle::Backticks,
610
+ "tildes" => CodeBlockStyle::Tildes,
611
+ other => {
612
+ return Err(runtime_error(format!(
613
+ "Invalid html_options.code_block_style '{}'",
614
+ other
615
+ )));
616
+ }
617
+ };
618
+ }
619
+
620
+ if let Some(val) = get_kw(ruby, hash, "keep_inline_images_in") {
621
+ let arr = RArray::try_convert(val)?;
622
+ options.keep_inline_images_in = arr.to_vec::<String>()?;
623
+ }
624
+
625
+ if let Some(val) = get_kw(ruby, hash, "encoding") {
626
+ options.encoding = String::try_convert(val)?;
627
+ }
628
+
629
+ if let Some(val) = get_kw(ruby, hash, "debug") {
630
+ options.debug = bool::try_convert(val)?;
631
+ }
632
+
633
+ if let Some(val) = get_kw(ruby, hash, "strip_tags") {
634
+ let arr = RArray::try_convert(val)?;
635
+ options.strip_tags = arr.to_vec::<String>()?;
636
+ }
637
+
638
+ if let Some(val) = get_kw(ruby, hash, "preserve_tags") {
639
+ let arr = RArray::try_convert(val)?;
640
+ options.preserve_tags = arr.to_vec::<String>()?;
641
+ }
642
+
643
+ if let Some(val) = get_kw(ruby, hash, "preprocessing")
644
+ && !val.is_nil()
645
+ {
646
+ let pre_hash = RHash::try_convert(val)?;
647
+ let mut preprocessing = options.preprocessing.clone();
648
+ if let Some(v) = get_kw(ruby, pre_hash, "enabled") {
649
+ preprocessing.enabled = bool::try_convert(v)?;
650
+ }
651
+ if let Some(v) = get_kw(ruby, pre_hash, "preset") {
652
+ let preset = symbol_to_string(v)?;
653
+ preprocessing.preset = match preset.to_lowercase().as_str() {
654
+ "minimal" => PreprocessingPreset::Minimal,
655
+ "standard" => PreprocessingPreset::Standard,
656
+ "aggressive" => PreprocessingPreset::Aggressive,
657
+ other => {
658
+ return Err(runtime_error(format!(
659
+ "Invalid html_options.preprocessing.preset '{}'",
660
+ other
661
+ )));
662
+ }
663
+ };
664
+ }
665
+ if let Some(v) = get_kw(ruby, pre_hash, "remove_navigation") {
666
+ preprocessing.remove_navigation = bool::try_convert(v)?;
667
+ }
668
+ if let Some(v) = get_kw(ruby, pre_hash, "remove_forms") {
669
+ preprocessing.remove_forms = bool::try_convert(v)?;
670
+ }
671
+ options.preprocessing = preprocessing;
672
+ }
673
+
674
+ Ok(options)
675
+ }
676
+
677
+ /// Convert KeywordAlgorithm to string
678
+ #[allow(dead_code)]
679
+ pub fn keyword_algorithm_to_str(algo: RustKeywordAlgorithm) -> &'static str {
680
+ match algo {
681
+ RustKeywordAlgorithm::Yake => "yake",
682
+ RustKeywordAlgorithm::Rake => "rake",
683
+ }
684
+ }
685
+
686
+ /// Convert KeywordConfig to Ruby Hash
687
+ #[allow(dead_code)]
688
+ pub fn keyword_config_to_ruby_hash(ruby: &Ruby, config: &RustKeywordConfig) -> Result<RHash, Error> {
689
+ let hash = ruby.hash_new();
690
+ hash.aset("algorithm", keyword_algorithm_to_str(config.algorithm))?;
691
+ hash.aset("max_keywords", config.max_keywords as i64)?;
692
+ hash.aset("min_score", config.min_score)?;
693
+ hash.aset("language", config.language.clone().unwrap_or_default())?;
694
+
695
+ let range_array = ruby.ary_new();
696
+ range_array.push(config.ngram_range.0 as i64)?;
697
+ range_array.push(config.ngram_range.1 as i64)?;
698
+ hash.aset("ngram_range", range_array)?;
699
+
700
+ if let Some(yake) = &config.yake_params {
701
+ let yake_hash = ruby.hash_new();
702
+ yake_hash.aset("window_size", yake.window_size as i64)?;
703
+ hash.aset("yake_params", yake_hash)?;
704
+ }
705
+
706
+ if let Some(rake) = &config.rake_params {
707
+ let rake_hash = ruby.hash_new();
708
+ rake_hash.aset("min_word_length", rake.min_word_length as i64)?;
709
+ rake_hash.aset("max_words_per_phrase", rake.max_words_per_phrase as i64)?;
710
+ hash.aset("rake_params", rake_hash)?;
711
+ }
712
+
713
+ Ok(hash)
714
+ }
715
+
716
+ /// Convert HTML conversion options to Ruby Hash
717
+ #[allow(dead_code)]
718
+ pub fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result<RHash, Error> {
719
+ let hash = ruby.hash_new();
720
+ hash.aset(
721
+ "heading_style",
722
+ match options.heading_style {
723
+ HeadingStyle::Atx => "atx",
724
+ HeadingStyle::Underlined => "underlined",
725
+ HeadingStyle::AtxClosed => "atx_closed",
726
+ },
727
+ )?;
728
+ hash.aset(
729
+ "list_indent_type",
730
+ match options.list_indent_type {
731
+ ListIndentType::Spaces => "spaces",
732
+ ListIndentType::Tabs => "tabs",
733
+ },
734
+ )?;
735
+ hash.aset("list_indent_width", options.list_indent_width as i64)?;
736
+ hash.aset("bullets", options.bullets.clone())?;
737
+ hash.aset("strong_em_symbol", options.strong_em_symbol.to_string())?;
738
+ hash.aset("escape_asterisks", options.escape_asterisks)?;
739
+ hash.aset("escape_underscores", options.escape_underscores)?;
740
+ hash.aset("escape_misc", options.escape_misc)?;
741
+ hash.aset("escape_ascii", options.escape_ascii)?;
742
+ hash.aset("code_language", options.code_language.clone())?;
743
+ hash.aset("autolinks", options.autolinks)?;
744
+ hash.aset("default_title", options.default_title)?;
745
+ hash.aset("br_in_tables", options.br_in_tables)?;
746
+ hash.aset("hocr_spatial_tables", options.hocr_spatial_tables)?;
747
+ hash.aset(
748
+ "highlight_style",
749
+ match options.highlight_style {
750
+ HighlightStyle::DoubleEqual => "double_equal",
751
+ HighlightStyle::Html => "html",
752
+ HighlightStyle::Bold => "bold",
753
+ HighlightStyle::None => "none",
754
+ },
755
+ )?;
756
+ hash.aset("extract_metadata", options.extract_metadata)?;
757
+ hash.aset(
758
+ "whitespace_mode",
759
+ match options.whitespace_mode {
760
+ WhitespaceMode::Normalized => "normalized",
761
+ WhitespaceMode::Strict => "strict",
762
+ },
763
+ )?;
764
+ hash.aset("strip_newlines", options.strip_newlines)?;
765
+ hash.aset("wrap", options.wrap)?;
766
+ hash.aset("wrap_width", options.wrap_width as i64)?;
767
+ hash.aset("convert_as_inline", options.convert_as_inline)?;
768
+ hash.aset("sub_symbol", options.sub_symbol.clone())?;
769
+ hash.aset("sup_symbol", options.sup_symbol.clone())?;
770
+ hash.aset(
771
+ "newline_style",
772
+ match options.newline_style {
773
+ NewlineStyle::Spaces => "spaces",
774
+ NewlineStyle::Backslash => "backslash",
775
+ },
776
+ )?;
777
+ hash.aset(
778
+ "code_block_style",
779
+ match options.code_block_style {
780
+ CodeBlockStyle::Indented => "indented",
781
+ CodeBlockStyle::Backticks => "backticks",
782
+ CodeBlockStyle::Tildes => "tildes",
783
+ },
784
+ )?;
785
+
786
+ let keep_inline = ruby.ary_new();
787
+ for tag in &options.keep_inline_images_in {
788
+ keep_inline.push(tag.as_str())?;
789
+ }
790
+ hash.aset("keep_inline_images_in", keep_inline)?;
791
+
792
+ hash.aset("encoding", options.encoding.clone())?;
793
+ hash.aset("debug", options.debug)?;
794
+
795
+ let strip_tags = ruby.ary_new();
796
+ for tag in &options.strip_tags {
797
+ strip_tags.push(tag.as_str())?;
798
+ }
799
+ hash.aset("strip_tags", strip_tags)?;
800
+
801
+ let preserve_tags = ruby.ary_new();
802
+ for tag in &options.preserve_tags {
803
+ preserve_tags.push(tag.as_str())?;
804
+ }
805
+ hash.aset("preserve_tags", preserve_tags)?;
806
+
807
+ let pre_hash = ruby.hash_new();
808
+ pre_hash.aset("enabled", options.preprocessing.enabled)?;
809
+ pre_hash.aset(
810
+ "preset",
811
+ match options.preprocessing.preset {
812
+ PreprocessingPreset::Minimal => "minimal",
813
+ PreprocessingPreset::Standard => "standard",
814
+ PreprocessingPreset::Aggressive => "aggressive",
815
+ },
816
+ )?;
817
+ pre_hash.aset("remove_navigation", options.preprocessing.remove_navigation)?;
818
+ pre_hash.aset("remove_forms", options.preprocessing.remove_forms)?;
819
+ hash.aset("preprocessing", pre_hash)?;
820
+
821
+ Ok(hash)
822
+ }
823
+
824
+ /// Parse PageConfig from Ruby Hash
825
+ pub fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
826
+ let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
827
+ bool::try_convert(val)?
828
+ } else {
829
+ false
830
+ };
831
+
832
+ let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
833
+ bool::try_convert(val)?
834
+ } else {
835
+ false
836
+ };
837
+
838
+ let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
839
+ String::try_convert(val)?
840
+ } else {
841
+ "\n\n<!-- PAGE {page_num} -->\n\n".to_string()
842
+ };
843
+
844
+ let config = PageConfig {
845
+ extract_pages,
846
+ insert_page_markers,
847
+ marker_format,
848
+ };
849
+
850
+ Ok(config)
851
+ }
852
+
853
+ /// Parse ExtractionConfig from Ruby Hash
854
+ pub fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
855
+ let mut config = ExtractionConfig::default();
856
+
857
+ if let Some(hash) = opts {
858
+ if let Some(val) = get_kw(ruby, hash, "use_cache") {
859
+ config.use_cache = bool::try_convert(val)?;
860
+ }
861
+
862
+ if let Some(val) = get_kw(ruby, hash, "enable_quality_processing") {
863
+ config.enable_quality_processing = bool::try_convert(val)?;
864
+ }
865
+
866
+ if let Some(val) = get_kw(ruby, hash, "force_ocr") {
867
+ config.force_ocr = bool::try_convert(val)?;
868
+ }
869
+
870
+ if let Some(val) = get_kw(ruby, hash, "ocr")
871
+ && !val.is_nil()
872
+ {
873
+ let ocr_hash = RHash::try_convert(val)?;
874
+ config.ocr = Some(parse_ocr_config(ruby, ocr_hash)?);
875
+ }
876
+
877
+ if let Some(val) = get_kw(ruby, hash, "chunking")
878
+ && !val.is_nil()
879
+ {
880
+ let chunking_hash = RHash::try_convert(val)?;
881
+ config.chunking = Some(parse_chunking_config(ruby, chunking_hash)?);
882
+ }
883
+
884
+ if let Some(val) = get_kw(ruby, hash, "language_detection")
885
+ && !val.is_nil()
886
+ {
887
+ let lang_hash = RHash::try_convert(val)?;
888
+ config.language_detection = Some(parse_language_detection_config(ruby, lang_hash)?);
889
+ }
890
+
891
+ if let Some(val) = get_kw(ruby, hash, "pdf_options")
892
+ && !val.is_nil()
893
+ {
894
+ let pdf_hash = RHash::try_convert(val)?;
895
+ config.pdf_options = Some(parse_pdf_config(ruby, pdf_hash)?);
896
+ }
897
+
898
+ if let Some(val) = get_kw(ruby, hash, "images")
899
+ && !val.is_nil()
900
+ {
901
+ let images_hash = RHash::try_convert(val)?;
902
+ config.images = Some(parse_image_extraction_config(ruby, images_hash)?);
903
+ }
904
+
905
+ if let Some(val) = get_kw(ruby, hash, "postprocessor")
906
+ && !val.is_nil()
907
+ {
908
+ let postprocessor_hash = RHash::try_convert(val)?;
909
+ config.postprocessor = Some(parse_postprocessor_config(ruby, postprocessor_hash)?);
910
+ }
911
+
912
+ if let Some(val) = get_kw(ruby, hash, "token_reduction")
913
+ && !val.is_nil()
914
+ {
915
+ let token_reduction_hash = RHash::try_convert(val)?;
916
+ config.token_reduction = Some(parse_token_reduction_config(ruby, token_reduction_hash)?);
917
+ }
918
+
919
+ if let Some(val) = get_kw(ruby, hash, "keywords")
920
+ && !val.is_nil()
921
+ {
922
+ let keywords_hash = RHash::try_convert(val)?;
923
+ config.keywords = Some(parse_keyword_config(ruby, keywords_hash)?);
924
+ }
925
+
926
+ if let Some(val) = get_kw(ruby, hash, "html_options")
927
+ && !val.is_nil()
928
+ {
929
+ let html_hash = RHash::try_convert(val)?;
930
+ config.html_options = Some(parse_html_options(ruby, html_hash)?);
931
+ }
932
+
933
+ if let Some(val) = get_kw(ruby, hash, "pages")
934
+ && !val.is_nil()
935
+ {
936
+ let pages_hash = RHash::try_convert(val)?;
937
+ config.pages = Some(parse_page_config(ruby, pages_hash)?);
938
+ }
939
+
940
+ if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
941
+ let value = usize::try_convert(val)?;
942
+ config.max_concurrent_extractions = Some(value);
943
+ }
944
+
945
+ if let Some(val) = get_kw(ruby, hash, "result_format") {
946
+ let format_str = String::try_convert(val)?;
947
+ config.result_format = match format_str.as_str() {
948
+ "unified" | "Unified" => kreuzberg::types::OutputFormat::Unified,
949
+ "element_based" | "ElementBased" | "elements" => kreuzberg::types::OutputFormat::ElementBased,
950
+ _ => {
951
+ return Err(runtime_error(format!(
952
+ "Invalid result_format: '{}'. Expected 'unified' or 'element_based'",
953
+ format_str
954
+ )))
955
+ }
956
+ };
957
+ }
958
+
959
+ if let Some(val) = get_kw(ruby, hash, "output_format") {
960
+ let format_str = String::try_convert(val)?;
961
+ config.output_format = match format_str.as_str() {
962
+ "plain" | "Plain" => OutputFormat::Plain,
963
+ "markdown" | "Markdown" => OutputFormat::Markdown,
964
+ "djot" | "Djot" => OutputFormat::Djot,
965
+ "html" | "Html" => OutputFormat::Html,
966
+ _ => {
967
+ return Err(runtime_error(format!(
968
+ "Invalid output_format: '{}'. Expected 'plain', 'markdown', 'djot', or 'html'",
969
+ format_str
970
+ )))
971
+ }
972
+ };
973
+ }
974
+ }
975
+
976
+ Ok(config)
977
+ }
978
+
979
+ /// Load extraction config from file
980
+ ///
981
+ /// Supports TOML, YAML, and JSON file formats. The format is detected from the file extension.
982
+ pub fn config_from_file(path: String) -> Result<RHash, Error> {
983
+ use std::path::Path;
984
+
985
+ let ruby = Ruby::get().expect("Ruby not initialized");
986
+ let file_path = Path::new(&path);
987
+
988
+ let content = fs::read_to_string(&path)
989
+ .map_err(|e| validation_error(format!("Failed to read config file '{}': {}", path, e)))?;
990
+
991
+ // Detect file format from extension
992
+ let extension = file_path
993
+ .extension()
994
+ .and_then(|ext| ext.to_str())
995
+ .map(|s| s.to_lowercase());
996
+
997
+ let json_value: serde_json::Value = match extension.as_deref() {
998
+ Some("toml") => {
999
+ toml::from_str(&content)
1000
+ .map_err(|e| validation_error(format!("Invalid TOML in config file '{}': {}", path, e)))?
1001
+ }
1002
+ Some("yaml") | Some("yml") => {
1003
+ serde_yaml_ng::from_str(&content)
1004
+ .map_err(|e| validation_error(format!("Invalid YAML in config file '{}': {}", path, e)))?
1005
+ }
1006
+ Some("json") => {
1007
+ serde_json::from_str(&content)
1008
+ .map_err(|e| validation_error(format!("Invalid JSON in config file '{}': {}", path, e)))?
1009
+ }
1010
+ Some(ext) => {
1011
+ return Err(validation_error(format!(
1012
+ "Unsupported config file format: .{}. Supported formats: .toml, .yaml, .yml, .json",
1013
+ ext
1014
+ )));
1015
+ }
1016
+ None => {
1017
+ return Err(validation_error(format!(
1018
+ "Cannot determine file format: no extension found in '{}'",
1019
+ path
1020
+ )));
1021
+ }
1022
+ };
1023
+
1024
+ json_value_to_ruby(&ruby, &json_value)
1025
+ .and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
1026
+ }
1027
+
1028
+ /// Discover extraction config from current directory
1029
+ pub fn config_discover() -> Result<Value, Error> {
1030
+ let ruby = Ruby::get().expect("Ruby not initialized");
1031
+
1032
+ // Search for config files in order of precedence
1033
+ let config_files = vec![
1034
+ ("kreuzberg.toml", "toml"),
1035
+ ("kreuzberg.yaml", "yaml"),
1036
+ ("kreuzberg.yml", "yaml"),
1037
+ ("kreuzberg.json", "json"),
1038
+ (".kreuzbergrc", "json"),
1039
+ ];
1040
+
1041
+ for (name, format) in config_files {
1042
+ if let Ok(content) = fs::read_to_string(name) {
1043
+ let json_value: serde_json::Value = match format {
1044
+ "toml" => toml::from_str(&content)
1045
+ .map_err(|e| validation_error(format!("Invalid TOML in {}: {}", name, e)))?,
1046
+ "yaml" => serde_yaml_ng::from_str(&content)
1047
+ .map_err(|e| validation_error(format!("Invalid YAML in {}: {}", name, e)))?,
1048
+ "json" => serde_json::from_str(&content)
1049
+ .map_err(|e| validation_error(format!("Invalid JSON in {}: {}", name, e)))?,
1050
+ _ => unreachable!(),
1051
+ };
1052
+ return json_value_to_ruby(&ruby, &json_value);
1053
+ }
1054
+ }
1055
+
1056
+ // Return nil if no config found
1057
+ Ok(ruby.qnil().as_value())
1058
+ }