kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,34 @@
1
+ //! Metadata handling and document format detection
2
+ //!
3
+ //! Provides utilities for MIME type detection, format validation, and extension mapping.
4
+
5
+ use crate::error_handling::runtime_error;
6
+ use magnus::Error;
7
+
8
+ /// Detect MIME type from bytes
9
+ pub fn detect_mime_type_from_bytes(bytes: String) -> Result<String, Error> {
10
+ let bytes_vec = bytes.into_bytes();
11
+ kreuzberg::core::mime::detect_mime_type_from_bytes(&bytes_vec)
12
+ .map_err(|e| runtime_error(format!("Failed to detect MIME type: {}", e)))
13
+ }
14
+
15
+ /// Detect MIME type from file path
16
+ pub fn detect_mime_type_from_path_native(path: String) -> Result<String, Error> {
17
+ kreuzberg::core::mime::detect_mime_type(&path, true)
18
+ .map_err(|e| runtime_error(format!("Failed to detect MIME type from path: {}", e)))
19
+ }
20
+
21
+ /// Validate MIME type
22
+ pub fn validate_mime_type_native(mime_type: String) -> Result<String, Error> {
23
+ if kreuzberg::core::mime::validate_mime_type(&mime_type).is_ok() {
24
+ Ok(mime_type)
25
+ } else {
26
+ Err(runtime_error(format!("Unsupported MIME type: {}", mime_type)))
27
+ }
28
+ }
29
+
30
+ /// Get file extensions for a given MIME type
31
+ pub fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Error> {
32
+ kreuzberg::core::mime::get_extensions_for_mime(&mime_type)
33
+ .map_err(|e| runtime_error(format!("Failed to get extensions: {}", e)))
34
+ }
@@ -0,0 +1,92 @@
1
+ //! Plugin management for Kreuzberg
2
+ //!
3
+ //! Handles registration and management of custom plugins including post-processors,
4
+ //! validators, and OCR backends.
5
+
6
+ pub mod post_processor;
7
+ pub mod validator;
8
+ pub mod ocr_backend;
9
+
10
+ pub use post_processor::register_post_processor;
11
+ pub use validator::register_validator;
12
+ pub use ocr_backend::{register_ocr_backend, unregister_ocr_backend, list_ocr_backends, clear_ocr_backends};
13
+
14
+ // Plugin registry functions
15
+ pub use kreuzberg::get_post_processor_registry;
16
+
17
+ use magnus::Error;
18
+ use kreuzberg::plugins::{
19
+ unregister_validator as kz_unregister_validator,
20
+ clear_validators as kz_clear_validators,
21
+ list_validators as kz_list_validators,
22
+ list_post_processors as kz_list_post_processors,
23
+ list_extractors as kz_list_extractors,
24
+ unregister_extractor as kz_unregister_extractor,
25
+ clear_extractors as kz_clear_extractors,
26
+ };
27
+
28
+ /// Unregister a post-processor plugin by name
29
+ pub fn unregister_post_processor(name: String) -> Result<(), Error> {
30
+ let registry = get_post_processor_registry();
31
+ registry
32
+ .write()
33
+ .map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
34
+ .remove(&name)
35
+ .map_err(crate::error_handling::kreuzberg_error)?;
36
+
37
+ Ok(())
38
+ }
39
+
40
+ /// Unregister a validator plugin by name
41
+ pub fn unregister_validator(name: String) -> Result<(), Error> {
42
+ kz_unregister_validator(&name)
43
+ .map_err(crate::error_handling::kreuzberg_error)
44
+ }
45
+
46
+ /// Clear all post-processors
47
+ pub fn clear_post_processors() -> Result<(), Error> {
48
+ let registry = get_post_processor_registry();
49
+ registry
50
+ .write()
51
+ .map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
52
+ .shutdown_all()
53
+ .map_err(crate::error_handling::kreuzberg_error)?;
54
+
55
+ Ok(())
56
+ }
57
+
58
+ /// Clear all validators
59
+ pub fn clear_validators() -> Result<(), Error> {
60
+ kz_clear_validators()
61
+ .map_err(crate::error_handling::kreuzberg_error)
62
+ }
63
+
64
+ /// List registered post-processors
65
+ pub fn list_post_processors() -> Result<Vec<String>, Error> {
66
+ kz_list_post_processors()
67
+ .map_err(crate::error_handling::kreuzberg_error)
68
+ }
69
+
70
+ /// List registered validators
71
+ pub fn list_validators() -> Result<Vec<String>, Error> {
72
+ kz_list_validators()
73
+ .map_err(crate::error_handling::kreuzberg_error)
74
+ }
75
+
76
+ /// List registered document extractors
77
+ pub fn list_document_extractors() -> Result<Vec<String>, Error> {
78
+ kz_list_extractors()
79
+ .map_err(crate::error_handling::kreuzberg_error)
80
+ }
81
+
82
+ /// Unregister a document extractor
83
+ pub fn unregister_document_extractor(name: String) -> Result<(), Error> {
84
+ kz_unregister_extractor(&name)
85
+ .map_err(crate::error_handling::kreuzberg_error)
86
+ }
87
+
88
+ /// Clear all document extractors
89
+ pub fn clear_document_extractors() -> Result<(), Error> {
90
+ kz_clear_extractors()
91
+ .map_err(crate::error_handling::kreuzberg_error)
92
+ }
@@ -0,0 +1,159 @@
1
+ //! OCR backend plugin registration and management
2
+
3
+ use crate::error_handling::{kreuzberg_error, runtime_error};
4
+ use crate::gc_guarded_value::GcGuardedValue;
5
+ use magnus::{Error, Ruby, TryConvert, Value};
6
+ use magnus::value::ReprValue;
7
+ use kreuzberg::plugins::{
8
+ register_ocr_backend as kz_register_ocr_backend,
9
+ unregister_ocr_backend as kz_unregister_ocr_backend,
10
+ list_ocr_backends as kz_list_ocr_backends,
11
+ clear_ocr_backends as kz_clear_ocr_backends,
12
+ OcrBackend, OcrBackendType, Plugin,
13
+ };
14
+ use kreuzberg::types::{ExtractionResult, Metadata};
15
+ use kreuzberg::{OcrConfig, KreuzbergError};
16
+ use async_trait::async_trait;
17
+ use std::path::Path;
18
+ use std::sync::Arc;
19
+
20
+ /// Ruby OCR backend wrapper that implements the OcrBackend trait
21
+ struct RubyOcrBackend {
22
+ name: String,
23
+ backend: GcGuardedValue,
24
+ }
25
+
26
+ // SAFETY: Ruby's GC is handled by GcGuardedValue, and we ensure all Ruby
27
+ // calls happen through proper Magnus/Ruby FFI boundaries
28
+ unsafe impl Send for RubyOcrBackend {}
29
+ unsafe impl Sync for RubyOcrBackend {}
30
+
31
+ impl Plugin for RubyOcrBackend {
32
+ fn name(&self) -> &str {
33
+ &self.name
34
+ }
35
+
36
+ fn version(&self) -> String {
37
+ "1.0.0".to_string()
38
+ }
39
+
40
+ fn initialize(&self) -> kreuzberg::Result<()> {
41
+ Ok(())
42
+ }
43
+
44
+ fn shutdown(&self) -> kreuzberg::Result<()> {
45
+ Ok(())
46
+ }
47
+ }
48
+
49
+ #[async_trait]
50
+ impl OcrBackend for RubyOcrBackend {
51
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
52
+ let backend_name = self.name.clone();
53
+ let backend = self.backend.value();
54
+ let image_data = image_bytes.to_vec();
55
+ let ocr_config = config.clone();
56
+
57
+ tokio::task::block_in_place(|| {
58
+ let ruby = Ruby::get().expect("Ruby not initialized");
59
+
60
+ // Convert image bytes to Ruby string (binary)
61
+ let ruby_bytes = ruby.str_from_slice(&image_data);
62
+
63
+ // Convert config to Ruby hash
64
+ let config_hash = ruby.hash_new();
65
+ config_hash.aset("backend", ocr_config.backend.as_str())
66
+ .map_err(|e| KreuzbergError::Plugin {
67
+ message: format!("Failed to set backend in config: {}", e),
68
+ plugin_name: backend_name.clone(),
69
+ })?;
70
+ config_hash.aset("language", ocr_config.language.as_str())
71
+ .map_err(|e| KreuzbergError::Plugin {
72
+ message: format!("Failed to set language in config: {}", e),
73
+ plugin_name: backend_name.clone(),
74
+ })?;
75
+
76
+ // Call Ruby backend's process_image method
77
+ let result: magnus::Value = backend
78
+ .funcall("process_image", (ruby_bytes, config_hash))
79
+ .map_err(|e| KreuzbergError::Plugin {
80
+ message: format!("Ruby OCR backend failed: {}", e),
81
+ plugin_name: backend_name.clone(),
82
+ })?;
83
+
84
+ // Convert result to String
85
+ let content = String::try_convert(result)
86
+ .map_err(|e| KreuzbergError::Plugin {
87
+ message: format!("OCR backend must return a String: {}", e),
88
+ plugin_name: backend_name.clone(),
89
+ })?;
90
+
91
+ Ok(ExtractionResult {
92
+ content,
93
+ mime_type: "text/plain".to_string(),
94
+ metadata: Metadata::default(),
95
+ tables: vec![],
96
+ detected_languages: None,
97
+ chunks: None,
98
+ images: None,
99
+ djot_content: None,
100
+ pages: None,
101
+ elements: None,
102
+ })
103
+ })
104
+ }
105
+
106
+ async fn process_file(&self, path: &Path, config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
107
+ let bytes = std::fs::read(path)?;
108
+ self.process_image(&bytes, config).await
109
+ }
110
+
111
+ fn supports_language(&self, _lang: &str) -> bool {
112
+ // Ruby backends are assumed to support all languages by default
113
+ // A more sophisticated implementation could call back to Ruby
114
+ true
115
+ }
116
+
117
+ fn backend_type(&self) -> OcrBackendType {
118
+ OcrBackendType::Custom
119
+ }
120
+ }
121
+
122
+ /// Register an OCR backend plugin
123
+ pub fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
124
+ let _ruby = Ruby::get().expect("Ruby not initialized");
125
+
126
+ // Validate that the backend has the required methods
127
+ if !backend.respond_to("name", true)? {
128
+ return Err(runtime_error("OCR backend must implement #name method"));
129
+ }
130
+ if !backend.respond_to("process_image", true)? {
131
+ return Err(runtime_error("OCR backend must implement #process_image(image_bytes, config) method"));
132
+ }
133
+
134
+ let backend_impl = Arc::new(RubyOcrBackend {
135
+ name: name.clone(),
136
+ backend: GcGuardedValue::new(backend),
137
+ });
138
+
139
+ kz_register_ocr_backend(backend_impl)
140
+ .map_err(kreuzberg_error)
141
+ }
142
+
143
+ /// Unregister an OCR backend
144
+ pub fn unregister_ocr_backend(_name: String) -> Result<(), Error> {
145
+ kz_unregister_ocr_backend(_name.as_str())
146
+ .map_err(kreuzberg_error)
147
+ }
148
+
149
+ /// List registered OCR backends
150
+ pub fn list_ocr_backends() -> Result<Vec<String>, Error> {
151
+ kz_list_ocr_backends()
152
+ .map_err(kreuzberg_error)
153
+ }
154
+
155
+ /// Clear all OCR backends
156
+ pub fn clear_ocr_backends() -> Result<(), Error> {
157
+ kz_clear_ocr_backends()
158
+ .map_err(kreuzberg_error)
159
+ }
@@ -0,0 +1,126 @@
1
+ //! Post-processor plugin registration and management
2
+
3
+ use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue, helpers::get_kw};
4
+ use magnus::{Error, Ruby, Value, scan_args::scan_args, TryConvert};
5
+ use magnus::value::ReprValue;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a post-processor plugin
9
+ pub fn register_post_processor(args: &[Value]) -> Result<(), Error> {
10
+ let _ruby = Ruby::get().expect("Ruby not initialized");
11
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
12
+ let (name, processor) = args.required;
13
+ let (priority,) = args.optional;
14
+ let priority = priority.unwrap_or(50);
15
+
16
+ if !processor.respond_to("call", true)? {
17
+ return Err(runtime_error("Post-processor must be a Proc or respond to 'call'"));
18
+ }
19
+
20
+ use async_trait::async_trait;
21
+ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
22
+
23
+ struct RubyPostProcessor {
24
+ name: String,
25
+ processor: GcGuardedValue,
26
+ }
27
+
28
+ unsafe impl Send for RubyPostProcessor {}
29
+ unsafe impl Sync for RubyPostProcessor {}
30
+
31
+ impl Plugin for RubyPostProcessor {
32
+ fn name(&self) -> &str {
33
+ &self.name
34
+ }
35
+
36
+ fn version(&self) -> String {
37
+ "1.0.0".to_string()
38
+ }
39
+
40
+ fn initialize(&self) -> kreuzberg::Result<()> {
41
+ Ok(())
42
+ }
43
+
44
+ fn shutdown(&self) -> kreuzberg::Result<()> {
45
+ Ok(())
46
+ }
47
+ }
48
+
49
+ #[async_trait]
50
+ impl PostProcessor for RubyPostProcessor {
51
+ async fn process(
52
+ &self,
53
+ result: &mut kreuzberg::ExtractionResult,
54
+ _config: &kreuzberg::ExtractionConfig,
55
+ ) -> kreuzberg::Result<()> {
56
+ let processor_name = self.name.clone();
57
+ let processor = self.processor.value();
58
+ let result_clone = result.clone();
59
+
60
+ let updated_result = tokio::task::block_in_place(|| {
61
+ let ruby = Ruby::get().expect("Ruby not initialized");
62
+ let result_hash = crate::result::extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
63
+ kreuzberg::KreuzbergError::Plugin {
64
+ message: format!("Failed to convert result to Ruby: {}", e),
65
+ plugin_name: processor_name.clone(),
66
+ }
67
+ })?;
68
+
69
+ let modified = processor
70
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
71
+ .map_err(|e| kreuzberg::KreuzbergError::Plugin {
72
+ message: format!("Ruby post-processor failed: {}", e),
73
+ plugin_name: processor_name.clone(),
74
+ })?;
75
+
76
+ let modified_hash =
77
+ magnus::RHash::try_convert(modified).map_err(|e| kreuzberg::KreuzbergError::Plugin {
78
+ message: format!("Post-processor must return a Hash: {}", e),
79
+ plugin_name: processor_name.clone(),
80
+ })?;
81
+
82
+ let mut updated_result = result_clone;
83
+
84
+ if let Some(content_val) = get_kw(&ruby, modified_hash, "content") {
85
+ let new_content =
86
+ String::try_convert(content_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
87
+ message: format!("Failed to convert content: {}", e),
88
+ plugin_name: processor_name.clone(),
89
+ })?;
90
+ updated_result.content = new_content;
91
+ }
92
+
93
+ if let Some(mime_val) = get_kw(&ruby, modified_hash, "mime_type") {
94
+ let new_mime = String::try_convert(mime_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
95
+ message: format!("Failed to convert mime_type: {}", e),
96
+ plugin_name: processor_name.clone(),
97
+ })?;
98
+ updated_result.mime_type = new_mime;
99
+ }
100
+
101
+ Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
102
+ })?;
103
+
104
+ *result = updated_result;
105
+ Ok(())
106
+ }
107
+
108
+ fn processing_stage(&self) -> ProcessingStage {
109
+ ProcessingStage::Late
110
+ }
111
+ }
112
+
113
+ let processor_impl = Arc::new(RubyPostProcessor {
114
+ name: name.clone(),
115
+ processor: GcGuardedValue::new(processor),
116
+ });
117
+
118
+ let registry = kreuzberg::get_post_processor_registry();
119
+ registry
120
+ .write()
121
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
122
+ .register(processor_impl, priority)
123
+ .map_err(kreuzberg_error)?;
124
+
125
+ Ok(())
126
+ }
@@ -0,0 +1,99 @@
1
+ //! Validator plugin registration and management
2
+
3
+ use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue};
4
+ use magnus::{Error, Value, scan_args::scan_args, Ruby};
5
+ use magnus::value::ReprValue;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a validator plugin
9
+ pub fn register_validator(args: &[Value]) -> Result<(), Error> {
10
+ let _ruby = Ruby::get().expect("Ruby not initialized");
11
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
12
+ let (name, validator) = args.required;
13
+ let (priority,) = args.optional;
14
+ let priority = priority.unwrap_or(50);
15
+
16
+ if !validator.respond_to("call", true)? {
17
+ return Err(runtime_error("Validator must be a Proc or respond to 'call'"));
18
+ }
19
+
20
+ use async_trait::async_trait;
21
+ use kreuzberg::plugins::{Plugin, Validator};
22
+
23
+ struct RubyValidator {
24
+ name: String,
25
+ validator: GcGuardedValue,
26
+ priority: i32,
27
+ }
28
+
29
+ unsafe impl Send for RubyValidator {}
30
+ unsafe impl Sync for RubyValidator {}
31
+
32
+ impl Plugin for RubyValidator {
33
+ fn name(&self) -> &str {
34
+ &self.name
35
+ }
36
+
37
+ fn version(&self) -> String {
38
+ "1.0.0".to_string()
39
+ }
40
+
41
+ fn initialize(&self) -> kreuzberg::Result<()> {
42
+ Ok(())
43
+ }
44
+
45
+ fn shutdown(&self) -> kreuzberg::Result<()> {
46
+ Ok(())
47
+ }
48
+ }
49
+
50
+ #[async_trait]
51
+ impl Validator for RubyValidator {
52
+ async fn validate(
53
+ &self,
54
+ result: &kreuzberg::ExtractionResult,
55
+ _config: &kreuzberg::ExtractionConfig,
56
+ ) -> kreuzberg::Result<()> {
57
+ let validator_name = self.name.clone();
58
+ let validator = self.validator.value();
59
+ let result_clone = result.clone();
60
+
61
+ tokio::task::block_in_place(|| {
62
+ let ruby = Ruby::get().expect("Ruby not initialized");
63
+ let result_hash =
64
+ crate::result::extraction_result_to_ruby(&ruby, result_clone).map_err(|e| kreuzberg::KreuzbergError::Plugin {
65
+ message: format!("Failed to convert result to Ruby: {}", e),
66
+ plugin_name: validator_name.clone(),
67
+ })?;
68
+
69
+ validator
70
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
71
+ .map_err(|e| kreuzberg::KreuzbergError::Validation {
72
+ message: format!("Validation failed: {}", e),
73
+ source: None,
74
+ })?;
75
+
76
+ Ok(())
77
+ })
78
+ }
79
+
80
+ fn priority(&self) -> i32 {
81
+ self.priority
82
+ }
83
+ }
84
+
85
+ let validator_impl = Arc::new(RubyValidator {
86
+ name: name.clone(),
87
+ validator: GcGuardedValue::new(validator),
88
+ priority,
89
+ });
90
+
91
+ let registry = kreuzberg::get_validator_registry();
92
+ registry
93
+ .write()
94
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
95
+ .register(validator_impl)
96
+ .map_err(kreuzberg_error)?;
97
+
98
+ Ok(())
99
+ }