kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,318 @@
1
+ //! HTML options parsing from JSON
2
+ //!
3
+ //! Handles the complex nested structure of HTML conversion options.
4
+
5
+ use html_to_markdown_rs::options::{
6
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
7
+ WhitespaceMode,
8
+ };
9
+
10
+ type FfiResult<T> = std::result::Result<T, String>;
11
+
12
+ /// Parse enum value from optional JSON value
13
+ fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
14
+ where
15
+ F: Fn(&str) -> FfiResult<T>,
16
+ {
17
+ if let Some(raw) = value {
18
+ let text = raw
19
+ .as_str()
20
+ .ok_or_else(|| "Expected string for enum field".to_string())?;
21
+ return parse_fn(text).map(Some);
22
+ }
23
+ Ok(None)
24
+ }
25
+
26
+ /// Parse HeadingStyle from string
27
+ fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
28
+ match value.to_lowercase().as_str() {
29
+ "atx" => Ok(HeadingStyle::Atx),
30
+ "underlined" => Ok(HeadingStyle::Underlined),
31
+ "atx_closed" => Ok(HeadingStyle::AtxClosed),
32
+ other => Err(format!(
33
+ "Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
34
+ other
35
+ )),
36
+ }
37
+ }
38
+
39
+ /// Parse ListIndentType from string
40
+ fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
41
+ match value.to_lowercase().as_str() {
42
+ "spaces" => Ok(ListIndentType::Spaces),
43
+ "tabs" => Ok(ListIndentType::Tabs),
44
+ other => Err(format!(
45
+ "Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
46
+ other
47
+ )),
48
+ }
49
+ }
50
+
51
+ /// Parse HighlightStyle from string
52
+ fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
53
+ match value.to_lowercase().as_str() {
54
+ "double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
55
+ "html" => Ok(HighlightStyle::Html),
56
+ "bold" => Ok(HighlightStyle::Bold),
57
+ "none" => Ok(HighlightStyle::None),
58
+ other => Err(format!(
59
+ "Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
60
+ other
61
+ )),
62
+ }
63
+ }
64
+
65
+ /// Parse WhitespaceMode from string
66
+ fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
67
+ match value.to_lowercase().as_str() {
68
+ "normalized" => Ok(WhitespaceMode::Normalized),
69
+ "strict" => Ok(WhitespaceMode::Strict),
70
+ other => Err(format!(
71
+ "Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
72
+ other
73
+ )),
74
+ }
75
+ }
76
+
77
+ /// Parse NewlineStyle from string
78
+ fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
79
+ match value.to_lowercase().as_str() {
80
+ "spaces" => Ok(NewlineStyle::Spaces),
81
+ "backslash" => Ok(NewlineStyle::Backslash),
82
+ other => Err(format!(
83
+ "Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
84
+ other
85
+ )),
86
+ }
87
+ }
88
+
89
+ /// Parse CodeBlockStyle from string
90
+ fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
91
+ match value.to_lowercase().as_str() {
92
+ "indented" => Ok(CodeBlockStyle::Indented),
93
+ "backticks" => Ok(CodeBlockStyle::Backticks),
94
+ "tildes" => Ok(CodeBlockStyle::Tildes),
95
+ other => Err(format!(
96
+ "Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
97
+ other
98
+ )),
99
+ }
100
+ }
101
+
102
+ /// Parse PreprocessingPreset from string
103
+ #[allow(dead_code)]
104
+ fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
105
+ match value.to_lowercase().as_str() {
106
+ "minimal" => Ok(PreprocessingPreset::Minimal),
107
+ "standard" => Ok(PreprocessingPreset::Standard),
108
+ "aggressive" => Ok(PreprocessingPreset::Aggressive),
109
+ other => Err(format!(
110
+ "Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
111
+ other
112
+ )),
113
+ }
114
+ }
115
+
116
+ /// Parse HTML conversion options from JSON value
117
+ pub fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
118
+ let mut opts = ConversionOptions::default();
119
+ let obj = value
120
+ .as_object()
121
+ .ok_or_else(|| "html_options must be an object".to_string())?;
122
+
123
+ if let Some(val) = obj.get("heading_style") {
124
+ opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
125
+ }
126
+
127
+ if let Some(val) = obj.get("list_indent_type") {
128
+ opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
129
+ }
130
+
131
+ if let Some(val) = obj.get("list_indent_width") {
132
+ opts.list_indent_width = val
133
+ .as_u64()
134
+ .map(|v| v as usize)
135
+ .ok_or_else(|| "list_indent_width must be an integer".to_string())?;
136
+ }
137
+
138
+ if let Some(val) = obj.get("bullets") {
139
+ opts.bullets = val
140
+ .as_str()
141
+ .map(str::to_string)
142
+ .ok_or_else(|| "bullets must be a string".to_string())?;
143
+ }
144
+
145
+ if let Some(val) = obj.get("strong_em_symbol") {
146
+ let symbol = val
147
+ .as_str()
148
+ .ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
149
+ let mut chars = symbol.chars();
150
+ opts.strong_em_symbol = chars
151
+ .next()
152
+ .ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
153
+ }
154
+
155
+ if let Some(val) = obj.get("escape_asterisks") {
156
+ opts.escape_asterisks = val
157
+ .as_bool()
158
+ .ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
159
+ }
160
+
161
+ if let Some(val) = obj.get("escape_underscores") {
162
+ opts.escape_underscores = val
163
+ .as_bool()
164
+ .ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
165
+ }
166
+
167
+ if let Some(val) = obj.get("escape_misc") {
168
+ opts.escape_misc = val
169
+ .as_bool()
170
+ .ok_or_else(|| "escape_misc must be a boolean".to_string())?;
171
+ }
172
+
173
+ if let Some(val) = obj.get("escape_ascii") {
174
+ opts.escape_ascii = val
175
+ .as_bool()
176
+ .ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
177
+ }
178
+
179
+ if let Some(val) = obj.get("code_language") {
180
+ opts.code_language = val
181
+ .as_str()
182
+ .map(str::to_string)
183
+ .ok_or_else(|| "code_language must be a string".to_string())?;
184
+ }
185
+
186
+ if let Some(val) = obj.get("autolinks") {
187
+ opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
188
+ }
189
+
190
+ if let Some(val) = obj.get("default_title") {
191
+ opts.default_title = val
192
+ .as_bool()
193
+ .ok_or_else(|| "default_title must be a boolean".to_string())?;
194
+ }
195
+
196
+ if let Some(val) = obj.get("br_in_tables") {
197
+ opts.br_in_tables = val
198
+ .as_bool()
199
+ .ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
200
+ }
201
+
202
+ if let Some(val) = obj.get("hocr_spatial_tables") {
203
+ opts.hocr_spatial_tables = val
204
+ .as_bool()
205
+ .ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
206
+ }
207
+
208
+ if let Some(val) = obj.get("highlight_style") {
209
+ opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
210
+ }
211
+
212
+ if let Some(val) = obj.get("extract_metadata") {
213
+ opts.extract_metadata = val
214
+ .as_bool()
215
+ .ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
216
+ }
217
+
218
+ if let Some(val) = obj.get("whitespace_mode") {
219
+ opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
220
+ }
221
+
222
+ if let Some(val) = obj.get("strip_newlines") {
223
+ opts.strip_newlines = val
224
+ .as_bool()
225
+ .ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
226
+ }
227
+
228
+ if let Some(val) = obj.get("wrap") {
229
+ opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
230
+ }
231
+
232
+ if let Some(val) = obj.get("wrap_width") {
233
+ opts.wrap_width = val
234
+ .as_u64()
235
+ .map(|v| v as usize)
236
+ .ok_or_else(|| "wrap_width must be an integer".to_string())?;
237
+ }
238
+
239
+ if let Some(val) = obj.get("convert_as_inline") {
240
+ opts.convert_as_inline = val
241
+ .as_bool()
242
+ .ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
243
+ }
244
+
245
+ if let Some(val) = obj.get("sub_symbol") {
246
+ opts.sub_symbol = val
247
+ .as_str()
248
+ .map(str::to_string)
249
+ .ok_or_else(|| "sub_symbol must be a string".to_string())?;
250
+ }
251
+
252
+ if let Some(val) = obj.get("sup_symbol") {
253
+ opts.sup_symbol = val
254
+ .as_str()
255
+ .map(str::to_string)
256
+ .ok_or_else(|| "sup_symbol must be a string".to_string())?;
257
+ }
258
+
259
+ if let Some(val) = obj.get("newline_style") {
260
+ opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
261
+ }
262
+
263
+ if let Some(val) = obj.get("code_block_style") {
264
+ opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
265
+ }
266
+
267
+ if let Some(val) = obj.get("keep_inline_images_in") {
268
+ opts.keep_inline_images_in = val
269
+ .as_array()
270
+ .ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
271
+ .iter()
272
+ .map(|v| {
273
+ v.as_str()
274
+ .map(str::to_string)
275
+ .ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
276
+ })
277
+ .collect::<FfiResult<Vec<_>>>()?;
278
+ }
279
+
280
+ if let Some(val) = obj.get("encoding") {
281
+ opts.encoding = val
282
+ .as_str()
283
+ .map(str::to_string)
284
+ .ok_or_else(|| "encoding must be a string".to_string())?;
285
+ }
286
+
287
+ if let Some(val) = obj.get("debug") {
288
+ opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
289
+ }
290
+
291
+ if let Some(val) = obj.get("strip_tags") {
292
+ opts.strip_tags = val
293
+ .as_array()
294
+ .ok_or_else(|| "strip_tags must be an array".to_string())?
295
+ .iter()
296
+ .map(|v| {
297
+ v.as_str()
298
+ .map(str::to_string)
299
+ .ok_or_else(|| "strip_tags entries must be strings".to_string())
300
+ })
301
+ .collect::<FfiResult<Vec<_>>>()?;
302
+ }
303
+
304
+ if let Some(val) = obj.get("preserve_tags") {
305
+ opts.preserve_tags = val
306
+ .as_array()
307
+ .ok_or_else(|| "preserve_tags must be an array".to_string())?
308
+ .iter()
309
+ .map(|v| {
310
+ v.as_str()
311
+ .map(str::to_string)
312
+ .ok_or_else(|| "preserve_tags entries must be strings".to_string())
313
+ })
314
+ .collect::<FfiResult<Vec<_>>>()?;
315
+ }
316
+
317
+ Ok(opts)
318
+ }
@@ -0,0 +1,154 @@
1
+ //! Configuration loading from files
2
+ //!
3
+ //! Handles loading ExtractionConfig from TOML/JSON/YAML files and discovery.
4
+
5
+ use crate::helpers::set_last_error;
6
+ use kreuzberg::KreuzbergError;
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use std::path::Path;
9
+
10
+ /// Load an ExtractionConfig from a file (returns JSON string).
11
+ ///
12
+ /// # Arguments
13
+ ///
14
+ /// * `file_path` - Path to the configuration file
15
+ ///
16
+ /// # Returns
17
+ ///
18
+ /// JSON string representation of the config, or error message.
19
+ pub fn load_config_as_json(file_path: &str) -> Result<String, String> {
20
+ match ExtractionConfig::from_file(file_path) {
21
+ Ok(config) => match serde_json::to_string(&config) {
22
+ Ok(json) => Ok(json),
23
+ Err(e) => Err(format!("Failed to serialize config to JSON: {}", e)),
24
+ },
25
+ Err(e) => Err(e.to_string()),
26
+ }
27
+ }
28
+
29
+ /// Load an ExtractionConfig from a file (returns config struct).
30
+ ///
31
+ /// # Arguments
32
+ ///
33
+ /// * `path` - Path to the configuration file
34
+ ///
35
+ /// # Returns
36
+ ///
37
+ /// ExtractionConfig on success, or error message.
38
+ pub fn load_config_from_file(path: &Path) -> Result<ExtractionConfig, String> {
39
+ match ExtractionConfig::from_file(path) {
40
+ Ok(config) => Ok(config),
41
+ Err(e) => match &e {
42
+ KreuzbergError::Io(io_err) => Err(format!("IO error loading config: {}", io_err)),
43
+ _ => Err(format!("Failed to load config from file: {}", e)),
44
+ },
45
+ }
46
+ }
47
+
48
+ /// Discover and load an ExtractionConfig (returns JSON string).
49
+ ///
50
+ /// Searches the current directory and all parent directories for:
51
+ /// - `kreuzberg.toml`
52
+ /// - `kreuzberg.json`
53
+ ///
54
+ /// # Returns
55
+ ///
56
+ /// JSON string of the first config file found, or None if not found.
57
+ pub fn discover_config_as_json() -> Option<String> {
58
+ match ExtractionConfig::discover() {
59
+ Ok(Some(config)) => match serde_json::to_string(&config) {
60
+ Ok(json) => Some(json),
61
+ Err(e) => {
62
+ set_last_error(format!("Failed to serialize config: {}", e));
63
+ None
64
+ }
65
+ },
66
+ Ok(None) => None,
67
+ Err(e) => {
68
+ match &e {
69
+ KreuzbergError::Io(io_err) => {
70
+ set_last_error(format!("IO error discovering config: {}", io_err));
71
+ }
72
+ _ => {
73
+ set_last_error(format!("Failed to discover config: {}", e));
74
+ }
75
+ }
76
+ None
77
+ }
78
+ }
79
+ }
80
+
81
+ /// List available embedding preset names.
82
+ ///
83
+ /// # Returns
84
+ ///
85
+ /// JSON array of preset names, or error message.
86
+ pub fn list_embedding_presets() -> Result<String, String> {
87
+ let presets = kreuzberg::embeddings::list_presets();
88
+ match serde_json::to_string(&presets) {
89
+ Ok(json) => Ok(json),
90
+ Err(e) => Err(format!("Failed to serialize presets: {}", e)),
91
+ }
92
+ }
93
+
94
+ /// Get a specific embedding preset by name.
95
+ ///
96
+ /// # Arguments
97
+ ///
98
+ /// * `preset_name` - Name of the preset to retrieve
99
+ ///
100
+ /// # Returns
101
+ ///
102
+ /// JSON representation of the preset, or error message.
103
+ pub fn get_embedding_preset(preset_name: &str) -> Result<String, String> {
104
+ let preset = match kreuzberg::embeddings::get_preset(preset_name) {
105
+ Some(preset) => preset,
106
+ None => {
107
+ return Err(format!("Unknown embedding preset: {}", preset_name));
108
+ }
109
+ };
110
+
111
+ let model_name = format!("{:?}", preset.model);
112
+ let serializable = super::serialize::SerializableEmbeddingPreset {
113
+ name: preset.name,
114
+ chunk_size: preset.chunk_size,
115
+ overlap: preset.overlap,
116
+ model_name,
117
+ dimensions: preset.dimensions,
118
+ description: preset.description,
119
+ };
120
+
121
+ match serde_json::to_string(&serializable) {
122
+ Ok(json) => Ok(json),
123
+ Err(e) => Err(format!("Failed to serialize embedding preset: {}", e)),
124
+ }
125
+ }
126
+
127
#[cfg(test)]
mod tests {
    use super::*;

    /// The preset listing succeeds and its output starts with `[` and ends with `]`.
    #[test]
    fn test_list_embedding_presets() {
        let listing = list_embedding_presets();
        assert!(listing.is_ok());

        let payload = listing.unwrap();
        assert!(payload.starts_with('['));
        assert!(payload.ends_with(']'));
    }

    /// A name that matches no preset is reported as an error.
    #[test]
    fn test_get_embedding_preset_unknown() {
        let lookup = get_embedding_preset("nonexistent_preset");
        assert!(lookup.is_err());
    }

    /// The "fast" preset serializes to JSON containing the expected field names.
    #[test]
    fn test_get_embedding_preset_valid() {
        let lookup = get_embedding_preset("fast");
        assert!(lookup.is_ok());

        let payload = lookup.unwrap();
        assert!(payload.contains("name"));
        assert!(payload.contains("chunk_size"));
    }
}
@@ -0,0 +1,104 @@
1
+ //! Configuration merging logic
2
+ //!
3
+ //! Provides functionality to merge two ExtractionConfig instances.
4
+
5
+ use kreuzberg::core::config::ExtractionConfig;
6
+
7
+ /// Merge two configs (override takes precedence over base).
8
+ ///
9
+ /// Performs a shallow merge where fields from `override_config` take
10
+ /// precedence over fields in `base`. The `base` config is modified in-place.
11
+ ///
12
+ /// # Arguments
13
+ ///
14
+ /// * `base` - Mutable reference to the base config (will be modified)
15
+ /// * `override_config` - Reference to the override config (read-only)
16
+ pub fn merge_configs(base: &mut ExtractionConfig, override_config: &ExtractionConfig) {
17
+ base.use_cache = override_config.use_cache;
18
+ base.enable_quality_processing = override_config.enable_quality_processing;
19
+ base.force_ocr = override_config.force_ocr;
20
+ base.max_concurrent_extractions = override_config.max_concurrent_extractions;
21
+
22
+ if override_config.ocr.is_some() {
23
+ base.ocr = override_config.ocr.clone();
24
+ }
25
+
26
+ if override_config.chunking.is_some() {
27
+ base.chunking = override_config.chunking.clone();
28
+ }
29
+
30
+ if override_config.images.is_some() {
31
+ base.images = override_config.images.clone();
32
+ }
33
+
34
+ #[cfg(feature = "pdf")]
35
+ if override_config.pdf_options.is_some() {
36
+ base.pdf_options = override_config.pdf_options.clone();
37
+ }
38
+
39
+ if override_config.token_reduction.is_some() {
40
+ base.token_reduction = override_config.token_reduction.clone();
41
+ }
42
+
43
+ if override_config.language_detection.is_some() {
44
+ base.language_detection = override_config.language_detection.clone();
45
+ }
46
+
47
+ if override_config.pages.is_some() {
48
+ base.pages = override_config.pages.clone();
49
+ }
50
+
51
+ #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
52
+ if override_config.keywords.is_some() {
53
+ base.keywords = override_config.keywords.clone();
54
+ }
55
+
56
+ if override_config.postprocessor.is_some() {
57
+ base.postprocessor = override_config.postprocessor.clone();
58
+ }
59
+
60
+ if override_config.html_options.is_some() {
61
+ base.html_options = override_config.html_options.clone();
62
+ }
63
+ }
64
+
65
#[cfg(test)]
mod tests {
    use super::*;

    /// After merging, `force_ocr` from the override is applied and
    /// `use_cache` is still true.
    #[test]
    fn test_merge_configs_simple() {
        let overrides = ExtractionConfig {
            force_ocr: true,
            ..Default::default()
        };
        let mut merged = ExtractionConfig {
            use_cache: true,
            force_ocr: false,
            ..Default::default()
        };

        merge_configs(&mut merged, &overrides);

        assert!(merged.use_cache);
        assert!(merged.force_ocr);
    }

    /// An explicit `use_cache: true` in the override replaces a `false` in the base.
    #[test]
    fn test_merge_configs_override_to_default() {
        let overrides = ExtractionConfig {
            use_cache: true,
            ..Default::default()
        };
        let mut merged = ExtractionConfig {
            use_cache: false,
            ..Default::default()
        };

        merge_configs(&mut merged, &overrides);

        assert!(merged.use_cache, "override to default value should be applied");
    }
}