kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,125 @@
1
+ //! Error handling and conversion to Ruby exceptions
2
+ //!
3
+ //! Provides error conversion from Kreuzberg errors to Magnus Ruby exceptions,
4
+ //! panic context retrieval, and error code utilities.
5
+
6
+ use kreuzberg::KreuzbergError;
7
+ use magnus::{Error, exception::ExceptionClass, Ruby};
8
+ use std::ffi::CStr;
9
+
10
+ pub use kreuzberg_ffi::{
11
+ kreuzberg_free_string, kreuzberg_last_error_code,
12
+ kreuzberg_last_panic_context,
13
+ };
14
+
15
+ /// Retrieve panic context from FFI if available
16
+ pub fn get_panic_context() -> Option<String> {
17
+ unsafe {
18
+ let ctx_ptr = kreuzberg_last_panic_context();
19
+ if ctx_ptr.is_null() {
20
+ return None;
21
+ }
22
+
23
+ let c_str = CStr::from_ptr(ctx_ptr);
24
+ let context = c_str.to_string_lossy().to_string();
25
+ kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
26
+
27
+ if context.is_empty() { None } else { Some(context) }
28
+ }
29
+ }
30
+
31
+ /// Retrieve error code from FFI
32
+ pub fn get_error_code() -> i32 {
33
+ unsafe { kreuzberg_last_error_code() }
34
+ }
35
+
36
+ /// Convert Kreuzberg errors to Ruby exceptions
37
+ pub fn kreuzberg_error(err: KreuzbergError) -> Error {
38
+ let ruby = Ruby::get().expect("Ruby not initialized");
39
+
40
+ let fetch_error_class = |name: &str| -> Option<ExceptionClass> {
41
+ ruby.eval::<ExceptionClass>(&format!("Kreuzberg::Errors::{}", name))
42
+ .ok()
43
+ };
44
+
45
+ match err {
46
+ KreuzbergError::Validation { message, .. } => {
47
+ if let Some(class) = fetch_error_class("ValidationError") {
48
+ Error::new(class, message)
49
+ } else {
50
+ Error::new(ruby.exception_arg_error(), message)
51
+ }
52
+ }
53
+ KreuzbergError::Parsing { message, .. } => {
54
+ if let Some(class) = fetch_error_class("ParsingError") {
55
+ Error::new(class, message)
56
+ } else {
57
+ Error::new(ruby.exception_runtime_error(), format!("ParsingError: {}", message))
58
+ }
59
+ }
60
+ KreuzbergError::Ocr { message, .. } => {
61
+ if let Some(class) = fetch_error_class("OCRError") {
62
+ Error::new(class, message)
63
+ } else {
64
+ Error::new(ruby.exception_runtime_error(), format!("OCRError: {}", message))
65
+ }
66
+ }
67
+ KreuzbergError::MissingDependency(message) => {
68
+ if let Some(class) = fetch_error_class("MissingDependencyError") {
69
+ Error::new(class, message)
70
+ } else {
71
+ Error::new(
72
+ ruby.exception_runtime_error(),
73
+ format!("MissingDependencyError: {}", message),
74
+ )
75
+ }
76
+ }
77
+ KreuzbergError::Plugin { message, plugin_name } => {
78
+ if let Some(class) = fetch_error_class("PluginError") {
79
+ Error::new(class, format!("{}: {}", plugin_name, message))
80
+ } else {
81
+ Error::new(
82
+ ruby.exception_runtime_error(),
83
+ format!("Plugin error in '{}': {}", plugin_name, message),
84
+ )
85
+ }
86
+ }
87
+ KreuzbergError::Io(err) => {
88
+ if let Some(class) = fetch_error_class("IOError") {
89
+ Error::new(class, err.to_string())
90
+ } else {
91
+ Error::new(ruby.exception_runtime_error(), format!("IO error: {}", err))
92
+ }
93
+ }
94
+ KreuzbergError::UnsupportedFormat(message) => {
95
+ if let Some(class) = fetch_error_class("UnsupportedFormatError") {
96
+ Error::new(class, message)
97
+ } else {
98
+ Error::new(
99
+ ruby.exception_runtime_error(),
100
+ format!("UnsupportedFormatError: {}", message),
101
+ )
102
+ }
103
+ }
104
+ other => Error::new(ruby.exception_runtime_error(), other.to_string()),
105
+ }
106
+ }
107
+
108
+ /// Create a generic runtime error
109
+ pub fn runtime_error(message: impl Into<String>) -> Error {
110
+ let ruby = Ruby::get().expect("Ruby not initialized");
111
+ Error::new(ruby.exception_runtime_error(), message.into())
112
+ }
113
+
114
+ /// Create a validation error (Kreuzberg::Errors::ValidationError)
115
+ pub fn validation_error(message: impl Into<String>) -> Error {
116
+ let ruby = Ruby::get().expect("Ruby not initialized");
117
+
118
+ // Try to get the ValidationError class from Ruby
119
+ if let Ok(class) = ruby.eval::<ExceptionClass>("Kreuzberg::Errors::ValidationError") {
120
+ Error::new(class, message.into())
121
+ } else {
122
+ // Fall back to ArgumentError if the class doesn't exist
123
+ Error::new(ruby.exception_arg_error(), message.into())
124
+ }
125
+ }
@@ -0,0 +1,79 @@
1
+ //! File extraction functions
2
+ //!
3
+ //! Handles extraction from files and byte arrays (synchronous and asynchronous).
4
+
5
+ use crate::config::parse_extraction_config;
6
+ use crate::error_handling::kreuzberg_error;
7
+ use crate::result::extraction_result_to_ruby;
8
+
9
+ use magnus::{Error, RHash, RString, Ruby, Value, scan_args::scan_args};
10
+
11
+ /// Extract content from a file (synchronous)
12
+ pub fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
13
+ let ruby = Ruby::get().expect("Ruby not initialized");
14
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
15
+ let (path,) = args.required;
16
+ let (mime_type,) = args.optional;
17
+ let opts = Some(args.keywords);
18
+
19
+ let config = parse_extraction_config(&ruby, opts)?;
20
+
21
+ let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config).map_err(kreuzberg_error)?;
22
+
23
+ extraction_result_to_ruby(&ruby, result)
24
+ }
25
+
26
+ /// Extract content from bytes (synchronous)
27
+ pub fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
28
+ let ruby = Ruby::get().expect("Ruby not initialized");
29
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
30
+ let (data, mime_type) = args.required;
31
+ let opts = Some(args.keywords);
32
+
33
+ let config = parse_extraction_config(&ruby, opts)?;
34
+
35
+ let bytes = unsafe { data.as_slice() };
36
+ let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
37
+
38
+ extraction_result_to_ruby(&ruby, result)
39
+ }
40
+
41
+ /// Extract content from a file (asynchronous)
42
+ pub fn extract_file(args: &[Value]) -> Result<RHash, Error> {
43
+ let ruby = Ruby::get().expect("Ruby not initialized");
44
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
45
+ let (path,) = args.required;
46
+ let (mime_type,) = args.optional;
47
+ let opts = Some(args.keywords);
48
+
49
+ let config = parse_extraction_config(&ruby, opts)?;
50
+
51
+ let runtime =
52
+ tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
53
+
54
+ let result = runtime
55
+ .block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
56
+ .map_err(kreuzberg_error)?;
57
+
58
+ extraction_result_to_ruby(&ruby, result)
59
+ }
60
+
61
+ /// Extract content from bytes (asynchronous)
62
+ pub fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
63
+ let ruby = Ruby::get().expect("Ruby not initialized");
64
+ let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
65
+ let (data, mime_type) = args.required;
66
+ let opts = Some(args.keywords);
67
+
68
+ let config = parse_extraction_config(&ruby, opts)?;
69
+
70
+ let runtime =
71
+ tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
72
+
73
+ let bytes = unsafe { data.as_slice() };
74
+ let result = runtime
75
+ .block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
76
+ .map_err(kreuzberg_error)?;
77
+
78
+ extraction_result_to_ruby(&ruby, result)
79
+ }
@@ -0,0 +1,35 @@
1
+ //! GC-guarded Ruby value wrapper for plugin registrations
2
+ //!
3
+ //! Keeps Ruby values alive across plugin registrations by informing the Ruby GC.
4
+
5
+ use magnus::{Ruby, Value};
6
+
7
+ /// Keeps Ruby values alive across plugin registrations by informing the GC.
8
+ ///
9
+ /// This prevents Ruby objects (like Procs) from being garbage collected while
10
+ /// they're being used as plugin callbacks.
11
+ pub struct GcGuardedValue {
12
+ value: Value,
13
+ }
14
+
15
+ impl GcGuardedValue {
16
+ /// Create a new GC-guarded value
17
+ pub fn new(value: Value) -> Self {
18
+ let ruby = Ruby::get().expect("Ruby not initialized");
19
+ ruby.gc_register_address(&value);
20
+ Self { value }
21
+ }
22
+
23
+ /// Get the wrapped value
24
+ pub fn value(&self) -> Value {
25
+ self.value
26
+ }
27
+ }
28
+
29
+ impl Drop for GcGuardedValue {
30
+ fn drop(&mut self) {
31
+ if let Ok(ruby) = Ruby::get() {
32
+ ruby.gc_unregister_address(&self.value);
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,176 @@
1
+ //! Helper utilities for Ruby value conversion and manipulation
2
+ //!
3
+ //! Provides utilities for converting between Ruby and JSON values,
4
+ //! accessing keyword arguments, and managing cache directories.
5
+
6
+ use magnus::{Error, RArray, RHash, Ruby, Symbol, Value, TryConvert, IntoValue};
7
+ use magnus::value::ReprValue;
8
+ use std::fs;
9
+ use std::path::{Path, PathBuf};
10
+
11
+ use crate::error_handling::runtime_error;
12
+
13
+ /// Convert Ruby Symbol or String to Rust String
14
+ pub fn symbol_to_string(value: Value) -> Result<String, Error> {
15
+ if let Some(symbol) = Symbol::from_value(value) {
16
+ Ok(symbol.name()?.to_string())
17
+ } else {
18
+ String::try_convert(value)
19
+ }
20
+ }
21
+
22
+ /// Get keyword argument from hash (supports both symbol and string keys)
23
+ pub fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
24
+ hash.get(name).or_else(|| {
25
+ let sym = ruby.intern(name);
26
+ hash.get(sym)
27
+ })
28
+ }
29
+
30
+ /// Set a hash entry with a string key
31
+ pub fn set_hash_entry(_ruby: &Ruby, hash: &RHash, key: &str, value: Value) -> Result<(), Error> {
32
+ hash.aset(key, value)?;
33
+ Ok(())
34
+ }
35
+
36
+ /// Convert serde_json Value to Ruby Value
37
+ pub fn json_value_to_ruby(ruby: &Ruby, value: &serde_json::Value) -> Result<Value, Error> {
38
+ Ok(match value {
39
+ serde_json::Value::Null => ruby.qnil().as_value(),
40
+ serde_json::Value::Bool(b) => {
41
+ if *b {
42
+ ruby.qtrue().as_value()
43
+ } else {
44
+ ruby.qfalse().as_value()
45
+ }
46
+ }
47
+ serde_json::Value::Number(num) => {
48
+ if let Some(i) = num.as_i64() {
49
+ ruby.integer_from_i64(i).into_value_with(ruby)
50
+ } else if let Some(u) = num.as_u64() {
51
+ ruby.integer_from_u64(u).into_value_with(ruby)
52
+ } else if let Some(f) = num.as_f64() {
53
+ ruby.float_from_f64(f).into_value_with(ruby)
54
+ } else {
55
+ ruby.qnil().as_value()
56
+ }
57
+ }
58
+ serde_json::Value::String(s) => ruby.str_new(s).into_value_with(ruby),
59
+ serde_json::Value::Array(items) => {
60
+ let ary = ruby.ary_new();
61
+ for item in items {
62
+ ary.push(json_value_to_ruby(ruby, item)?)?;
63
+ }
64
+ ary.into_value_with(ruby)
65
+ }
66
+ serde_json::Value::Object(map) => {
67
+ let hash = ruby.hash_new();
68
+ for (key, val) in map {
69
+ let key_value = ruby.str_new(key).into_value_with(ruby);
70
+ let val_value = json_value_to_ruby(ruby, val)?;
71
+ hash.aset(key_value, val_value)?;
72
+ }
73
+ hash.into_value_with(ruby)
74
+ }
75
+ })
76
+ }
77
+
78
+ /// Convert Ruby key (String or Symbol) to Rust String
79
+ pub fn ruby_key_to_string(value: Value) -> Result<String, Error> {
80
+ if let Ok(sym) = Symbol::try_convert(value) {
81
+ Ok(sym.name()?.to_string())
82
+ } else {
83
+ String::try_convert(value)
84
+ }
85
+ }
86
+
87
+ /// Convert Ruby Value to serde_json Value
88
+ pub fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
89
+ let ruby = Ruby::get().expect("Ruby not initialized");
90
+
91
+ if value.is_nil() {
92
+ return Ok(serde_json::Value::Null);
93
+ }
94
+
95
+ if value.equal(ruby.qtrue())? {
96
+ return Ok(serde_json::Value::Bool(true));
97
+ }
98
+
99
+ if value.equal(ruby.qfalse())? {
100
+ return Ok(serde_json::Value::Bool(false));
101
+ }
102
+
103
+ if let Ok(integer) = i64::try_convert(value) {
104
+ return Ok(serde_json::Value::Number(integer.into()));
105
+ }
106
+
107
+ if let Ok(unsigned) = u64::try_convert(value) {
108
+ return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
109
+ }
110
+
111
+ if let Ok(float) = f64::try_convert(value)
112
+ && let Some(num) = serde_json::Number::from_f64(float)
113
+ {
114
+ return Ok(serde_json::Value::Number(num));
115
+ }
116
+
117
+ if let Ok(sym) = Symbol::try_convert(value) {
118
+ return Ok(serde_json::Value::String(sym.name()?.to_string()));
119
+ }
120
+
121
+ if let Ok(string) = String::try_convert(value) {
122
+ return Ok(serde_json::Value::String(string));
123
+ }
124
+
125
+ if let Ok(array) = RArray::try_convert(value) {
126
+ let mut values = Vec::with_capacity(array.len());
127
+ for item in array.into_iter() {
128
+ values.push(ruby_value_to_json(item)?);
129
+ }
130
+ return Ok(serde_json::Value::Array(values));
131
+ }
132
+
133
+ if let Ok(hash) = RHash::try_convert(value) {
134
+ let mut map = serde_json::Map::new();
135
+ hash.foreach(|key: Value, val: Value| {
136
+ let key_string = ruby_key_to_string(key)?;
137
+ let json_value = ruby_value_to_json(val)?;
138
+ map.insert(key_string, json_value);
139
+ Ok(magnus::r_hash::ForEach::Continue)
140
+ })?;
141
+
142
+ return Ok(serde_json::Value::Object(map));
143
+ }
144
+
145
+ Err(runtime_error("Unsupported Ruby value for JSON conversion"))
146
+ }
147
+
148
+ /// Get the cache root directory
149
+ pub fn cache_root_dir() -> Result<PathBuf, Error> {
150
+ std::env::current_dir()
151
+ .map(|dir| dir.join(".kreuzberg"))
152
+ .map_err(|e| runtime_error(format!("Failed to get current directory: {}", e)))
153
+ }
154
+
155
+ /// Get all cache directories (root and subdirectories)
156
+ pub fn cache_directories(root: &Path) -> Result<Vec<PathBuf>, Error> {
157
+ if !root.exists() {
158
+ return Ok(vec![]);
159
+ }
160
+
161
+ let mut dirs = vec![root.to_path_buf()];
162
+ let entries = fs::read_dir(root).map_err(|e| runtime_error(format!("Failed to read cache root: {}", e)))?;
163
+
164
+ for entry in entries {
165
+ let entry = entry.map_err(|e| runtime_error(format!("Failed to read cache directory entry: {}", e)))?;
166
+ if entry
167
+ .file_type()
168
+ .map_err(|e| runtime_error(format!("Failed to determine cache entry type: {}", e)))?
169
+ .is_dir()
170
+ {
171
+ dirs.push(entry.path());
172
+ }
173
+ }
174
+
175
+ Ok(dirs)
176
+ }