kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,125 @@
1
+ //! Error handling and conversion to Ruby exceptions
2
+ //!
3
+ //! Provides error conversion from Kreuzberg errors to Magnus Ruby exceptions,
4
+ //! panic context retrieval, and error code utilities.
5
+
6
+ use kreuzberg::KreuzbergError;
7
+ use magnus::{Error, exception::ExceptionClass, Ruby};
8
+ use std::ffi::CStr;
9
+
10
+ pub use kreuzberg_ffi::{
11
+ kreuzberg_free_string, kreuzberg_last_error_code,
12
+ kreuzberg_last_panic_context,
13
+ };
14
+
15
+ /// Retrieve panic context from FFI if available
16
+ pub fn get_panic_context() -> Option<String> {
17
+ unsafe {
18
+ let ctx_ptr = kreuzberg_last_panic_context();
19
+ if ctx_ptr.is_null() {
20
+ return None;
21
+ }
22
+
23
+ let c_str = CStr::from_ptr(ctx_ptr);
24
+ let context = c_str.to_string_lossy().to_string();
25
+ kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
26
+
27
+ if context.is_empty() { None } else { Some(context) }
28
+ }
29
+ }
30
+
31
+ /// Retrieve error code from FFI
32
+ pub fn get_error_code() -> i32 {
33
+ unsafe { kreuzberg_last_error_code() }
34
+ }
35
+
36
+ /// Convert Kreuzberg errors to Ruby exceptions
37
+ pub fn kreuzberg_error(err: KreuzbergError) -> Error {
38
+ let ruby = Ruby::get().expect("Ruby not initialized");
39
+
40
+ let fetch_error_class = |name: &str| -> Option<ExceptionClass> {
41
+ ruby.eval::<ExceptionClass>(&format!("Kreuzberg::Errors::{}", name))
42
+ .ok()
43
+ };
44
+
45
+ match err {
46
+ KreuzbergError::Validation { message, .. } => {
47
+ if let Some(class) = fetch_error_class("ValidationError") {
48
+ Error::new(class, message)
49
+ } else {
50
+ Error::new(ruby.exception_arg_error(), message)
51
+ }
52
+ }
53
+ KreuzbergError::Parsing { message, .. } => {
54
+ if let Some(class) = fetch_error_class("ParsingError") {
55
+ Error::new(class, message)
56
+ } else {
57
+ Error::new(ruby.exception_runtime_error(), format!("ParsingError: {}", message))
58
+ }
59
+ }
60
+ KreuzbergError::Ocr { message, .. } => {
61
+ if let Some(class) = fetch_error_class("OCRError") {
62
+ Error::new(class, message)
63
+ } else {
64
+ Error::new(ruby.exception_runtime_error(), format!("OCRError: {}", message))
65
+ }
66
+ }
67
+ KreuzbergError::MissingDependency(message) => {
68
+ if let Some(class) = fetch_error_class("MissingDependencyError") {
69
+ Error::new(class, message)
70
+ } else {
71
+ Error::new(
72
+ ruby.exception_runtime_error(),
73
+ format!("MissingDependencyError: {}", message),
74
+ )
75
+ }
76
+ }
77
+ KreuzbergError::Plugin { message, plugin_name } => {
78
+ if let Some(class) = fetch_error_class("PluginError") {
79
+ Error::new(class, format!("{}: {}", plugin_name, message))
80
+ } else {
81
+ Error::new(
82
+ ruby.exception_runtime_error(),
83
+ format!("Plugin error in '{}': {}", plugin_name, message),
84
+ )
85
+ }
86
+ }
87
+ KreuzbergError::Io(err) => {
88
+ if let Some(class) = fetch_error_class("IOError") {
89
+ Error::new(class, err.to_string())
90
+ } else {
91
+ Error::new(ruby.exception_runtime_error(), format!("IO error: {}", err))
92
+ }
93
+ }
94
+ KreuzbergError::UnsupportedFormat(message) => {
95
+ if let Some(class) = fetch_error_class("UnsupportedFormatError") {
96
+ Error::new(class, message)
97
+ } else {
98
+ Error::new(
99
+ ruby.exception_runtime_error(),
100
+ format!("UnsupportedFormatError: {}", message),
101
+ )
102
+ }
103
+ }
104
+ other => Error::new(ruby.exception_runtime_error(), other.to_string()),
105
+ }
106
+ }
107
+
108
+ /// Create a generic runtime error
109
+ pub fn runtime_error(message: impl Into<String>) -> Error {
110
+ let ruby = Ruby::get().expect("Ruby not initialized");
111
+ Error::new(ruby.exception_runtime_error(), message.into())
112
+ }
113
+
114
+ /// Create a validation error (Kreuzberg::Errors::ValidationError)
115
+ pub fn validation_error(message: impl Into<String>) -> Error {
116
+ let ruby = Ruby::get().expect("Ruby not initialized");
117
+
118
+ // Try to get the ValidationError class from Ruby
119
+ if let Ok(class) = ruby.eval::<ExceptionClass>("Kreuzberg::Errors::ValidationError") {
120
+ Error::new(class, message.into())
121
+ } else {
122
+ // Fall back to ArgumentError if the class doesn't exist
123
+ Error::new(ruby.exception_arg_error(), message.into())
124
+ }
125
+ }
@@ -0,0 +1,79 @@
1
+ //! File extraction functions
2
+ //!
3
+ //! Handles extraction from files and byte arrays (synchronous and asynchronous).
4
+
5
+ use crate::config::parse_extraction_config;
6
+ use crate::error_handling::kreuzberg_error;
7
+ use crate::result::extraction_result_to_ruby;
8
+
9
+ use magnus::{Error, RHash, RString, Ruby, Value, scan_args::scan_args};
10
+
11
+ /// Extract content from a file (synchronous)
12
+ pub fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
13
+ let ruby = Ruby::get().expect("Ruby not initialized");
14
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
15
+ let (path,) = args.required;
16
+ let (mime_type,) = args.optional;
17
+ let opts = Some(args.keywords);
18
+
19
+ let config = parse_extraction_config(&ruby, opts)?;
20
+
21
+ let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config).map_err(kreuzberg_error)?;
22
+
23
+ extraction_result_to_ruby(&ruby, result)
24
+ }
25
+
26
+ /// Extract content from bytes (synchronous)
27
+ pub fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
28
+ let ruby = Ruby::get().expect("Ruby not initialized");
29
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
30
+ let (data, mime_type) = args.required;
31
+ let opts = Some(args.keywords);
32
+
33
+ let config = parse_extraction_config(&ruby, opts)?;
34
+
35
+ let bytes = unsafe { data.as_slice() };
36
+ let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
37
+
38
+ extraction_result_to_ruby(&ruby, result)
39
+ }
40
+
41
+ /// Extract content from a file (asynchronous)
42
+ pub fn extract_file(args: &[Value]) -> Result<RHash, Error> {
43
+ let ruby = Ruby::get().expect("Ruby not initialized");
44
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
45
+ let (path,) = args.required;
46
+ let (mime_type,) = args.optional;
47
+ let opts = Some(args.keywords);
48
+
49
+ let config = parse_extraction_config(&ruby, opts)?;
50
+
51
+ let runtime =
52
+ tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
53
+
54
+ let result = runtime
55
+ .block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
56
+ .map_err(kreuzberg_error)?;
57
+
58
+ extraction_result_to_ruby(&ruby, result)
59
+ }
60
+
61
+ /// Extract content from bytes (asynchronous)
62
+ pub fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
63
+ let ruby = Ruby::get().expect("Ruby not initialized");
64
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
65
+ let (data, mime_type) = args.required;
66
+ let opts = Some(args.keywords);
67
+
68
+ let config = parse_extraction_config(&ruby, opts)?;
69
+
70
+ let runtime =
71
+ tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
72
+
73
+ let bytes = unsafe { data.as_slice() };
74
+ let result = runtime
75
+ .block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
76
+ .map_err(kreuzberg_error)?;
77
+
78
+ extraction_result_to_ruby(&ruby, result)
79
+ }
@@ -0,0 +1,35 @@
1
+ //! GC-guarded Ruby value wrapper for plugin registrations
2
+ //!
3
+ //! Keeps Ruby values alive across plugin registrations by informing the Ruby GC.
4
+
5
+ use magnus::{Ruby, Value};
6
+
7
+ /// Keeps Ruby values alive across plugin registrations by informing the GC.
8
+ ///
9
+ /// This prevents Ruby objects (like Procs) from being garbage collected while
10
+ /// they're being used as plugin callbacks.
11
+ pub struct GcGuardedValue {
12
+ value: Value,
13
+ }
14
+
15
+ impl GcGuardedValue {
16
+ /// Create a new GC-guarded value
17
+ pub fn new(value: Value) -> Self {
18
+ let ruby = Ruby::get().expect("Ruby not initialized");
19
+ ruby.gc_register_address(&value);
20
+ Self { value }
21
+ }
22
+
23
+ /// Get the wrapped value
24
+ pub fn value(&self) -> Value {
25
+ self.value
26
+ }
27
+ }
28
+
29
+ impl Drop for GcGuardedValue {
30
+ fn drop(&mut self) {
31
+ if let Ok(ruby) = Ruby::get() {
32
+ ruby.gc_unregister_address(&self.value);
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,176 @@
1
+ //! Helper utilities for Ruby value conversion and manipulation
2
+ //!
3
+ //! Provides utilities for converting between Ruby and JSON values,
4
+ //! accessing keyword arguments, and managing cache directories.
5
+
6
+ use magnus::{Error, RArray, RHash, Ruby, Symbol, Value, TryConvert, IntoValue};
7
+ use magnus::value::ReprValue;
8
+ use std::fs;
9
+ use std::path::{Path, PathBuf};
10
+
11
+ use crate::error_handling::runtime_error;
12
+
13
+ /// Convert Ruby Symbol or String to Rust String
14
+ pub fn symbol_to_string(value: Value) -> Result<String, Error> {
15
+ if let Some(symbol) = Symbol::from_value(value) {
16
+ Ok(symbol.name()?.to_string())
17
+ } else {
18
+ String::try_convert(value)
19
+ }
20
+ }
21
+
22
+ /// Get keyword argument from hash (supports both symbol and string keys)
23
+ pub fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
24
+ hash.get(name).or_else(|| {
25
+ let sym = ruby.intern(name);
26
+ hash.get(sym)
27
+ })
28
+ }
29
+
30
+ /// Set a hash entry with a string key
31
+ pub fn set_hash_entry(_ruby: &Ruby, hash: &RHash, key: &str, value: Value) -> Result<(), Error> {
32
+ hash.aset(key, value)?;
33
+ Ok(())
34
+ }
35
+
36
+ /// Convert serde_json Value to Ruby Value
37
+ pub fn json_value_to_ruby(ruby: &Ruby, value: &serde_json::Value) -> Result<Value, Error> {
38
+ Ok(match value {
39
+ serde_json::Value::Null => ruby.qnil().as_value(),
40
+ serde_json::Value::Bool(b) => {
41
+ if *b {
42
+ ruby.qtrue().as_value()
43
+ } else {
44
+ ruby.qfalse().as_value()
45
+ }
46
+ }
47
+ serde_json::Value::Number(num) => {
48
+ if let Some(i) = num.as_i64() {
49
+ ruby.integer_from_i64(i).into_value_with(ruby)
50
+ } else if let Some(u) = num.as_u64() {
51
+ ruby.integer_from_u64(u).into_value_with(ruby)
52
+ } else if let Some(f) = num.as_f64() {
53
+ ruby.float_from_f64(f).into_value_with(ruby)
54
+ } else {
55
+ ruby.qnil().as_value()
56
+ }
57
+ }
58
+ serde_json::Value::String(s) => ruby.str_new(s).into_value_with(ruby),
59
+ serde_json::Value::Array(items) => {
60
+ let ary = ruby.ary_new();
61
+ for item in items {
62
+ ary.push(json_value_to_ruby(ruby, item)?)?;
63
+ }
64
+ ary.into_value_with(ruby)
65
+ }
66
+ serde_json::Value::Object(map) => {
67
+ let hash = ruby.hash_new();
68
+ for (key, val) in map {
69
+ let key_value = ruby.str_new(key).into_value_with(ruby);
70
+ let val_value = json_value_to_ruby(ruby, val)?;
71
+ hash.aset(key_value, val_value)?;
72
+ }
73
+ hash.into_value_with(ruby)
74
+ }
75
+ })
76
+ }
77
+
78
+ /// Convert Ruby key (String or Symbol) to Rust String
79
+ pub fn ruby_key_to_string(value: Value) -> Result<String, Error> {
80
+ if let Ok(sym) = Symbol::try_convert(value) {
81
+ Ok(sym.name()?.to_string())
82
+ } else {
83
+ String::try_convert(value)
84
+ }
85
+ }
86
+
87
+ /// Convert Ruby Value to serde_json Value
88
+ pub fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
89
+ let ruby = Ruby::get().expect("Ruby not initialized");
90
+
91
+ if value.is_nil() {
92
+ return Ok(serde_json::Value::Null);
93
+ }
94
+
95
+ if value.equal(ruby.qtrue())? {
96
+ return Ok(serde_json::Value::Bool(true));
97
+ }
98
+
99
+ if value.equal(ruby.qfalse())? {
100
+ return Ok(serde_json::Value::Bool(false));
101
+ }
102
+
103
+ if let Ok(integer) = i64::try_convert(value) {
104
+ return Ok(serde_json::Value::Number(integer.into()));
105
+ }
106
+
107
+ if let Ok(unsigned) = u64::try_convert(value) {
108
+ return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
109
+ }
110
+
111
+ if let Ok(float) = f64::try_convert(value)
112
+ && let Some(num) = serde_json::Number::from_f64(float)
113
+ {
114
+ return Ok(serde_json::Value::Number(num));
115
+ }
116
+
117
+ if let Ok(sym) = Symbol::try_convert(value) {
118
+ return Ok(serde_json::Value::String(sym.name()?.to_string()));
119
+ }
120
+
121
+ if let Ok(string) = String::try_convert(value) {
122
+ return Ok(serde_json::Value::String(string));
123
+ }
124
+
125
+ if let Ok(array) = RArray::try_convert(value) {
126
+ let mut values = Vec::with_capacity(array.len());
127
+ for item in array.into_iter() {
128
+ values.push(ruby_value_to_json(item)?);
129
+ }
130
+ return Ok(serde_json::Value::Array(values));
131
+ }
132
+
133
+ if let Ok(hash) = RHash::try_convert(value) {
134
+ let mut map = serde_json::Map::new();
135
+ hash.foreach(|key: Value, val: Value| {
136
+ let key_string = ruby_key_to_string(key)?;
137
+ let json_value = ruby_value_to_json(val)?;
138
+ map.insert(key_string, json_value);
139
+ Ok(magnus::r_hash::ForEach::Continue)
140
+ })?;
141
+
142
+ return Ok(serde_json::Value::Object(map));
143
+ }
144
+
145
+ Err(runtime_error("Unsupported Ruby value for JSON conversion"))
146
+ }
147
+
148
+ /// Get the cache root directory
149
+ pub fn cache_root_dir() -> Result<PathBuf, Error> {
150
+ std::env::current_dir()
151
+ .map(|dir| dir.join(".kreuzberg"))
152
+ .map_err(|e| runtime_error(format!("Failed to get current directory: {}", e)))
153
+ }
154
+
155
+ /// Get all cache directories (root and subdirectories)
156
+ pub fn cache_directories(root: &Path) -> Result<Vec<PathBuf>, Error> {
157
+ if !root.exists() {
158
+ return Ok(vec![]);
159
+ }
160
+
161
+ let mut dirs = vec![root.to_path_buf()];
162
+ let entries = fs::read_dir(root).map_err(|e| runtime_error(format!("Failed to read cache root: {}", e)))?;
163
+
164
+ for entry in entries {
165
+ let entry = entry.map_err(|e| runtime_error(format!("Failed to read cache directory entry: {}", e)))?;
166
+ if entry
167
+ .file_type()
168
+ .map_err(|e| runtime_error(format!("Failed to determine cache entry type: {}", e)))?
169
+ .is_dir()
170
+ {
171
+ dirs.push(entry.path());
172
+ }
173
+ }
174
+
175
+ Ok(dirs)
176
+ }