kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,34 @@
1
+ //! Metadata handling and document format detection
2
+ //!
3
+ //! Provides utilities for MIME type detection, format validation, and extension mapping.
4
+
5
+ use crate::error_handling::runtime_error;
6
+ use magnus::Error;
7
+
8
+ /// Detect MIME type from bytes
9
+ pub fn detect_mime_type_from_bytes(bytes: String) -> Result<String, Error> {
10
+ let bytes_vec = bytes.into_bytes();
11
+ kreuzberg::core::mime::detect_mime_type_from_bytes(&bytes_vec)
12
+ .map_err(|e| runtime_error(format!("Failed to detect MIME type: {}", e)))
13
+ }
14
+
15
+ /// Detect MIME type from file path
16
+ pub fn detect_mime_type_from_path_native(path: String) -> Result<String, Error> {
17
+ kreuzberg::core::mime::detect_mime_type(&path, true)
18
+ .map_err(|e| runtime_error(format!("Failed to detect MIME type from path: {}", e)))
19
+ }
20
+
21
+ /// Validate MIME type
22
+ pub fn validate_mime_type_native(mime_type: String) -> Result<String, Error> {
23
+ if kreuzberg::core::mime::validate_mime_type(&mime_type).is_ok() {
24
+ Ok(mime_type)
25
+ } else {
26
+ Err(runtime_error(format!("Unsupported MIME type: {}", mime_type)))
27
+ }
28
+ }
29
+
30
+ /// Get file extensions for a given MIME type
31
+ pub fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Error> {
32
+ kreuzberg::core::mime::get_extensions_for_mime(&mime_type)
33
+ .map_err(|e| runtime_error(format!("Failed to get extensions: {}", e)))
34
+ }
@@ -0,0 +1,92 @@
1
+ //! Plugin management for Kreuzberg
2
+ //!
3
+ //! Handles registration and management of custom plugins including post-processors,
4
+ //! validators, and OCR backends.
5
+
6
+ pub mod post_processor;
7
+ pub mod validator;
8
+ pub mod ocr_backend;
9
+
10
+ pub use post_processor::register_post_processor;
11
+ pub use validator::register_validator;
12
+ pub use ocr_backend::{register_ocr_backend, unregister_ocr_backend, list_ocr_backends, clear_ocr_backends};
13
+
14
+ // Plugin registry functions
15
+ pub use kreuzberg::get_post_processor_registry;
16
+
17
+ use magnus::Error;
18
+ use kreuzberg::plugins::{
19
+ unregister_validator as kz_unregister_validator,
20
+ clear_validators as kz_clear_validators,
21
+ list_validators as kz_list_validators,
22
+ list_post_processors as kz_list_post_processors,
23
+ list_extractors as kz_list_extractors,
24
+ unregister_extractor as kz_unregister_extractor,
25
+ clear_extractors as kz_clear_extractors,
26
+ };
27
+
28
+ /// Unregister a post-processor plugin by name
29
+ pub fn unregister_post_processor(name: String) -> Result<(), Error> {
30
+ let registry = get_post_processor_registry();
31
+ registry
32
+ .write()
33
+ .map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
34
+ .remove(&name)
35
+ .map_err(crate::error_handling::kreuzberg_error)?;
36
+
37
+ Ok(())
38
+ }
39
+
40
+ /// Unregister a validator plugin by name
41
+ pub fn unregister_validator(name: String) -> Result<(), Error> {
42
+ kz_unregister_validator(&name)
43
+ .map_err(crate::error_handling::kreuzberg_error)
44
+ }
45
+
46
+ /// Clear all post-processors
47
+ pub fn clear_post_processors() -> Result<(), Error> {
48
+ let registry = get_post_processor_registry();
49
+ registry
50
+ .write()
51
+ .map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
52
+ .shutdown_all()
53
+ .map_err(crate::error_handling::kreuzberg_error)?;
54
+
55
+ Ok(())
56
+ }
57
+
58
+ /// Clear all validators
59
+ pub fn clear_validators() -> Result<(), Error> {
60
+ kz_clear_validators()
61
+ .map_err(crate::error_handling::kreuzberg_error)
62
+ }
63
+
64
+ /// List registered post-processors
65
+ pub fn list_post_processors() -> Result<Vec<String>, Error> {
66
+ kz_list_post_processors()
67
+ .map_err(crate::error_handling::kreuzberg_error)
68
+ }
69
+
70
+ /// List registered validators
71
+ pub fn list_validators() -> Result<Vec<String>, Error> {
72
+ kz_list_validators()
73
+ .map_err(crate::error_handling::kreuzberg_error)
74
+ }
75
+
76
+ /// List registered document extractors
77
+ pub fn list_document_extractors() -> Result<Vec<String>, Error> {
78
+ kz_list_extractors()
79
+ .map_err(crate::error_handling::kreuzberg_error)
80
+ }
81
+
82
+ /// Unregister a document extractor
83
+ pub fn unregister_document_extractor(name: String) -> Result<(), Error> {
84
+ kz_unregister_extractor(&name)
85
+ .map_err(crate::error_handling::kreuzberg_error)
86
+ }
87
+
88
+ /// Clear all document extractors
89
+ pub fn clear_document_extractors() -> Result<(), Error> {
90
+ kz_clear_extractors()
91
+ .map_err(crate::error_handling::kreuzberg_error)
92
+ }
@@ -0,0 +1,159 @@
1
+ //! OCR backend plugin registration and management
2
+
3
+ use crate::error_handling::{kreuzberg_error, runtime_error};
4
+ use crate::gc_guarded_value::GcGuardedValue;
5
+ use magnus::{Error, Ruby, TryConvert, Value};
6
+ use magnus::value::ReprValue;
7
+ use kreuzberg::plugins::{
8
+ register_ocr_backend as kz_register_ocr_backend,
9
+ unregister_ocr_backend as kz_unregister_ocr_backend,
10
+ list_ocr_backends as kz_list_ocr_backends,
11
+ clear_ocr_backends as kz_clear_ocr_backends,
12
+ OcrBackend, OcrBackendType, Plugin,
13
+ };
14
+ use kreuzberg::types::{ExtractionResult, Metadata};
15
+ use kreuzberg::{OcrConfig, KreuzbergError};
16
+ use async_trait::async_trait;
17
+ use std::path::Path;
18
+ use std::sync::Arc;
19
+
20
+ /// Ruby OCR backend wrapper that implements the OcrBackend trait
21
+ struct RubyOcrBackend {
22
+ name: String,
23
+ backend: GcGuardedValue,
24
+ }
25
+
26
+ // SAFETY: Ruby's GC is handled by GcGuardedValue, and we ensure all Ruby
27
+ // calls happen through proper Magnus/Ruby FFI boundaries
28
+ unsafe impl Send for RubyOcrBackend {}
29
+ unsafe impl Sync for RubyOcrBackend {}
30
+
31
+ impl Plugin for RubyOcrBackend {
32
+ fn name(&self) -> &str {
33
+ &self.name
34
+ }
35
+
36
+ fn version(&self) -> String {
37
+ "1.0.0".to_string()
38
+ }
39
+
40
+ fn initialize(&self) -> kreuzberg::Result<()> {
41
+ Ok(())
42
+ }
43
+
44
+ fn shutdown(&self) -> kreuzberg::Result<()> {
45
+ Ok(())
46
+ }
47
+ }
48
+
49
+ #[async_trait]
50
+ impl OcrBackend for RubyOcrBackend {
51
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
52
+ let backend_name = self.name.clone();
53
+ let backend = self.backend.value();
54
+ let image_data = image_bytes.to_vec();
55
+ let ocr_config = config.clone();
56
+
57
+ tokio::task::block_in_place(|| {
58
+ let ruby = Ruby::get().expect("Ruby not initialized");
59
+
60
+ // Convert image bytes to Ruby string (binary)
61
+ let ruby_bytes = ruby.str_from_slice(&image_data);
62
+
63
+ // Convert config to Ruby hash
64
+ let config_hash = ruby.hash_new();
65
+ config_hash.aset("backend", ocr_config.backend.as_str())
66
+ .map_err(|e| KreuzbergError::Plugin {
67
+ message: format!("Failed to set backend in config: {}", e),
68
+ plugin_name: backend_name.clone(),
69
+ })?;
70
+ config_hash.aset("language", ocr_config.language.as_str())
71
+ .map_err(|e| KreuzbergError::Plugin {
72
+ message: format!("Failed to set language in config: {}", e),
73
+ plugin_name: backend_name.clone(),
74
+ })?;
75
+
76
+ // Call Ruby backend's process_image method
77
+ let result: magnus::Value = backend
78
+ .funcall("process_image", (ruby_bytes, config_hash))
79
+ .map_err(|e| KreuzbergError::Plugin {
80
+ message: format!("Ruby OCR backend failed: {}", e),
81
+ plugin_name: backend_name.clone(),
82
+ })?;
83
+
84
+ // Convert result to String
85
+ let content = String::try_convert(result)
86
+ .map_err(|e| KreuzbergError::Plugin {
87
+ message: format!("OCR backend must return a String: {}", e),
88
+ plugin_name: backend_name.clone(),
89
+ })?;
90
+
91
+ Ok(ExtractionResult {
92
+ content,
93
+ mime_type: "text/plain".to_string(),
94
+ metadata: Metadata::default(),
95
+ tables: vec![],
96
+ detected_languages: None,
97
+ chunks: None,
98
+ images: None,
99
+ djot_content: None,
100
+ pages: None,
101
+ elements: None,
102
+ })
103
+ })
104
+ }
105
+
106
+ async fn process_file(&self, path: &Path, config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
107
+ let bytes = std::fs::read(path)?;
108
+ self.process_image(&bytes, config).await
109
+ }
110
+
111
+ fn supports_language(&self, _lang: &str) -> bool {
112
+ // Ruby backends are assumed to support all languages by default
113
+ // A more sophisticated implementation could call back to Ruby
114
+ true
115
+ }
116
+
117
+ fn backend_type(&self) -> OcrBackendType {
118
+ OcrBackendType::Custom
119
+ }
120
+ }
121
+
122
+ /// Register an OCR backend plugin
123
+ pub fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
124
+ let _ruby = Ruby::get().expect("Ruby not initialized");
125
+
126
+ // Validate that the backend has the required methods
127
+ if !backend.respond_to("name", true)? {
128
+ return Err(runtime_error("OCR backend must implement #name method"));
129
+ }
130
+ if !backend.respond_to("process_image", true)? {
131
+ return Err(runtime_error("OCR backend must implement #process_image(image_bytes, config) method"));
132
+ }
133
+
134
+ let backend_impl = Arc::new(RubyOcrBackend {
135
+ name: name.clone(),
136
+ backend: GcGuardedValue::new(backend),
137
+ });
138
+
139
+ kz_register_ocr_backend(backend_impl)
140
+ .map_err(kreuzberg_error)
141
+ }
142
+
143
+ /// Unregister an OCR backend
144
+ pub fn unregister_ocr_backend(_name: String) -> Result<(), Error> {
145
+ kz_unregister_ocr_backend(_name.as_str())
146
+ .map_err(kreuzberg_error)
147
+ }
148
+
149
+ /// List registered OCR backends
150
+ pub fn list_ocr_backends() -> Result<Vec<String>, Error> {
151
+ kz_list_ocr_backends()
152
+ .map_err(kreuzberg_error)
153
+ }
154
+
155
+ /// Clear all OCR backends
156
+ pub fn clear_ocr_backends() -> Result<(), Error> {
157
+ kz_clear_ocr_backends()
158
+ .map_err(kreuzberg_error)
159
+ }
@@ -0,0 +1,126 @@
1
+ //! Post-processor plugin registration and management
2
+
3
+ use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue, helpers::get_kw};
4
+ use magnus::{Error, Ruby, Value, scan_args::scan_args, TryConvert};
5
+ use magnus::value::ReprValue;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a post-processor plugin
9
+ pub fn register_post_processor(args: &[Value]) -> Result<(), Error> {
10
+ let _ruby = Ruby::get().expect("Ruby not initialized");
11
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
12
+ let (name, processor) = args.required;
13
+ let (priority,) = args.optional;
14
+ let priority = priority.unwrap_or(50);
15
+
16
+ if !processor.respond_to("call", true)? {
17
+ return Err(runtime_error("Post-processor must be a Proc or respond to 'call'"));
18
+ }
19
+
20
+ use async_trait::async_trait;
21
+ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
22
+
23
+ struct RubyPostProcessor {
24
+ name: String,
25
+ processor: GcGuardedValue,
26
+ }
27
+
28
+ unsafe impl Send for RubyPostProcessor {}
29
+ unsafe impl Sync for RubyPostProcessor {}
30
+
31
+ impl Plugin for RubyPostProcessor {
32
+ fn name(&self) -> &str {
33
+ &self.name
34
+ }
35
+
36
+ fn version(&self) -> String {
37
+ "1.0.0".to_string()
38
+ }
39
+
40
+ fn initialize(&self) -> kreuzberg::Result<()> {
41
+ Ok(())
42
+ }
43
+
44
+ fn shutdown(&self) -> kreuzberg::Result<()> {
45
+ Ok(())
46
+ }
47
+ }
48
+
49
+ #[async_trait]
50
+ impl PostProcessor for RubyPostProcessor {
51
+ async fn process(
52
+ &self,
53
+ result: &mut kreuzberg::ExtractionResult,
54
+ _config: &kreuzberg::ExtractionConfig,
55
+ ) -> kreuzberg::Result<()> {
56
+ let processor_name = self.name.clone();
57
+ let processor = self.processor.value();
58
+ let result_clone = result.clone();
59
+
60
+ let updated_result = tokio::task::block_in_place(|| {
61
+ let ruby = Ruby::get().expect("Ruby not initialized");
62
+ let result_hash = crate::result::extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
63
+ kreuzberg::KreuzbergError::Plugin {
64
+ message: format!("Failed to convert result to Ruby: {}", e),
65
+ plugin_name: processor_name.clone(),
66
+ }
67
+ })?;
68
+
69
+ let modified = processor
70
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
71
+ .map_err(|e| kreuzberg::KreuzbergError::Plugin {
72
+ message: format!("Ruby post-processor failed: {}", e),
73
+ plugin_name: processor_name.clone(),
74
+ })?;
75
+
76
+ let modified_hash =
77
+ magnus::RHash::try_convert(modified).map_err(|e| kreuzberg::KreuzbergError::Plugin {
78
+ message: format!("Post-processor must return a Hash: {}", e),
79
+ plugin_name: processor_name.clone(),
80
+ })?;
81
+
82
+ let mut updated_result = result_clone;
83
+
84
+ if let Some(content_val) = get_kw(&ruby, modified_hash, "content") {
85
+ let new_content =
86
+ String::try_convert(content_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
87
+ message: format!("Failed to convert content: {}", e),
88
+ plugin_name: processor_name.clone(),
89
+ })?;
90
+ updated_result.content = new_content;
91
+ }
92
+
93
+ if let Some(mime_val) = get_kw(&ruby, modified_hash, "mime_type") {
94
+ let new_mime = String::try_convert(mime_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
95
+ message: format!("Failed to convert mime_type: {}", e),
96
+ plugin_name: processor_name.clone(),
97
+ })?;
98
+ updated_result.mime_type = new_mime;
99
+ }
100
+
101
+ Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
102
+ })?;
103
+
104
+ *result = updated_result;
105
+ Ok(())
106
+ }
107
+
108
+ fn processing_stage(&self) -> ProcessingStage {
109
+ ProcessingStage::Late
110
+ }
111
+ }
112
+
113
+ let processor_impl = Arc::new(RubyPostProcessor {
114
+ name: name.clone(),
115
+ processor: GcGuardedValue::new(processor),
116
+ });
117
+
118
+ let registry = kreuzberg::get_post_processor_registry();
119
+ registry
120
+ .write()
121
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
122
+ .register(processor_impl, priority)
123
+ .map_err(kreuzberg_error)?;
124
+
125
+ Ok(())
126
+ }
@@ -0,0 +1,99 @@
1
+ //! Validator plugin registration and management
2
+
3
+ use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue};
4
+ use magnus::{Error, Value, scan_args::scan_args, Ruby};
5
+ use magnus::value::ReprValue;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a validator plugin
9
+ pub fn register_validator(args: &[Value]) -> Result<(), Error> {
10
+ let _ruby = Ruby::get().expect("Ruby not initialized");
11
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
12
+ let (name, validator) = args.required;
13
+ let (priority,) = args.optional;
14
+ let priority = priority.unwrap_or(50);
15
+
16
+ if !validator.respond_to("call", true)? {
17
+ return Err(runtime_error("Validator must be a Proc or respond to 'call'"));
18
+ }
19
+
20
+ use async_trait::async_trait;
21
+ use kreuzberg::plugins::{Plugin, Validator};
22
+
23
+ struct RubyValidator {
24
+ name: String,
25
+ validator: GcGuardedValue,
26
+ priority: i32,
27
+ }
28
+
29
+ unsafe impl Send for RubyValidator {}
30
+ unsafe impl Sync for RubyValidator {}
31
+
32
+ impl Plugin for RubyValidator {
33
+ fn name(&self) -> &str {
34
+ &self.name
35
+ }
36
+
37
+ fn version(&self) -> String {
38
+ "1.0.0".to_string()
39
+ }
40
+
41
+ fn initialize(&self) -> kreuzberg::Result<()> {
42
+ Ok(())
43
+ }
44
+
45
+ fn shutdown(&self) -> kreuzberg::Result<()> {
46
+ Ok(())
47
+ }
48
+ }
49
+
50
+ #[async_trait]
51
+ impl Validator for RubyValidator {
52
+ async fn validate(
53
+ &self,
54
+ result: &kreuzberg::ExtractionResult,
55
+ _config: &kreuzberg::ExtractionConfig,
56
+ ) -> kreuzberg::Result<()> {
57
+ let validator_name = self.name.clone();
58
+ let validator = self.validator.value();
59
+ let result_clone = result.clone();
60
+
61
+ tokio::task::block_in_place(|| {
62
+ let ruby = Ruby::get().expect("Ruby not initialized");
63
+ let result_hash =
64
+ crate::result::extraction_result_to_ruby(&ruby, result_clone).map_err(|e| kreuzberg::KreuzbergError::Plugin {
65
+ message: format!("Failed to convert result to Ruby: {}", e),
66
+ plugin_name: validator_name.clone(),
67
+ })?;
68
+
69
+ validator
70
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
71
+ .map_err(|e| kreuzberg::KreuzbergError::Validation {
72
+ message: format!("Validation failed: {}", e),
73
+ source: None,
74
+ })?;
75
+
76
+ Ok(())
77
+ })
78
+ }
79
+
80
+ fn priority(&self) -> i32 {
81
+ self.priority
82
+ }
83
+ }
84
+
85
+ let validator_impl = Arc::new(RubyValidator {
86
+ name: name.clone(),
87
+ validator: GcGuardedValue::new(validator),
88
+ priority,
89
+ });
90
+
91
+ let registry = kreuzberg::get_validator_registry();
92
+ registry
93
+ .write()
94
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
95
+ .register(validator_impl)
96
+ .map_err(kreuzberg_error)?;
97
+
98
+ Ok(())
99
+ }