whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -0,0 +1,1110 @@
1
#include "ggml-openvino.h"

#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/utils.h"
#include "ggml-quants.h"
#include "ggml.h"

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <map>
#include <memory>
#include <mutex>
#include <openvino/core/type/element_type.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/allocator.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/tensor.hpp>
#include <set>
#include <string>
#include <vector>
26
+
27
+ #if defined(_WIN32)
28
+ # define WIN32_LEAN_AND_MEAN
29
+ # ifndef NOMINMAX
30
+ # define NOMINMAX
31
+ # endif
32
+ # include <windows.h>
33
+ #else
34
+ # include <unistd.h>
35
+ #endif
36
+
37
+ // =====================================================
38
+ // OpenVINO Buffer Implementation using ov::Tensor
39
+ // =====================================================
40
+ //
41
+ // Design: This implementation uses a hybrid approach:
42
+ // 1. For weight tensors: Store a pre-built ov::op::v0::Constant in tensor->extra
43
+ // - This avoids the memcpy during graph construction
44
+ // - For quantized weights, the constant is already converted to OpenVINO format
45
+ // 2. For KV cache / compute tensors: Store an ov::Tensor in tensor->extra
46
+ // - This can be directly passed to infer_request
47
+ // - Future: can be changed to ov::RemoteTensor for GPU/NPU
48
+ //
49
+ // This design is similar to:
50
+ // - CUDA split buffer: tensor->extra stores device pointers
51
+ // - CPU repack buffer: tensor->extra stores tensor_traits with repacked data
52
+ // =====================================================
53
+
54
// Buffer context that manages per-tensor allocations (no contiguous buffer for weights).
//
// Owns either a host allocation (ggml_aligned_malloc) or a GPU USM device allocation
// (ov::intel_gpu::ocl::USMTensor), wrapped in `ov_buffer` either way, plus the per-tensor
// "extra" objects created by init_tensor/set_tensor.
struct ggml_backend_openvino_buffer_context {
    int device;        // device index this buffer was allocated for
    std::string name;  // GGML_OPENVINO_NAME + device index
    size_t id;         // unique, monotonically increasing buffer id (assigned in the ctor)

    // For non-weight buffers (KV cache, compute), we still use contiguous allocation
    void * data;     // base pointer of the contiguous allocation (host or USM device)
    size_t size;     // allocation size in bytes
    bool is_remote;  // true when `data` is GPU USM device memory, false for host memory

    // Wrapping of the buffer
    std::shared_ptr<ov::Tensor> ov_buffer;

    // Track all extras for cleanup; the context owns these and deletes them in the dtor
    std::map<ggml_tensor *, ggml_openvino_extra_base *> tensor_extras;

    // Used for re-allocation on device for kvcache
    void * data_prev;

    // Allocates `size` bytes on construction. With is_remote=true, allocates USM device
    // memory through the OpenVINO GPU remote context (device name must be "GPU");
    // otherwise uses aligned host memory. size == 0 leaves `data` null.
    ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
        device(device),
        name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
        id([]() {
            // Process-wide counter; ids start at 1 so 0 can mean "no buffer".
            static std::atomic<size_t> next_id{1};
            return next_id.fetch_add(1);
        }()),
        data(nullptr),
        size(size),
        is_remote(is_remote) {
        if (size == 0) {
            return;
        }

        const auto & device_name = ggml_openvino_get_device_name();

        if (is_remote) {
            GGML_ASSERT(device_name == "GPU");
            auto remote_context = ggml_openvino_get_remote_context();
            auto gpu_context = remote_context->as<ov::intel_gpu::ocl::ClContext>();
            ov::intel_gpu::ocl::USMTensor usm_tensor =
                gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size});
            data = usm_tensor.get();
            // Keep the USMTensor alive; it owns the device allocation.
            ov_buffer = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
        } else {
            data = ggml_aligned_malloc(size);
            // Host path: wrap the raw allocation without copying (ov::Tensor does not own it).
            ov_buffer = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
        }

        if (data == nullptr) {
            GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
            return;
        }

        // The backend promises TENSOR_ALIGNMENT via get_alignment; a misaligned base
        // would corrupt tensor placement, so treat it as fatal.
        if (reinterpret_cast<uintptr_t>(data) % TENSOR_ALIGNMENT != 0) {
            GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(),
                           TENSOR_ALIGNMENT);
            GGML_ABORT("fatal error");
        }
    }

    ~ggml_backend_openvino_buffer_context() {
        // Clean up all tensor extras
        // GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device,
        //                size / 1024 / 1024);
        for (auto & pair : tensor_extras) {
            delete pair.second;
        }
        tensor_extras.clear();
        // Remote allocations are released by ov_buffer's destructor; only host memory
        // needs an explicit free here.
        if (!is_remote && data != nullptr) {
            ggml_aligned_free(data, size);
        }
    }
};
128
+
129
// Buffer type context (per-device)
struct ggml_backend_openvino_buffer_type_context {
    int device;        // device index this buffer type serves
    std::string name;  // GGML_OPENVINO_NAME + device index, returned by get_name
};
134
+
135
+ // Buffer interface functions
136
+ static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) {
137
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
138
+ delete ctx;
139
+ }
140
+
141
+ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) {
142
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
143
+ return ctx->data;
144
+ }
145
+
146
// Called once for every tensor placed into the buffer.
//
// Three responsibilities:
//  1. KV-cache migration: on GPU (non-stateful mode), the first "cache_*" tensor
//     triggers a transparent re-allocation of the whole buffer into USM device
//     memory; the tensor's data pointer is rebased into the new allocation.
//  2. Views: share the extra of their view_src and return immediately.
//  3. Plain (non-quantized) tensors: get an ov tensor extra so they can be handed
//     to an infer_request directly.
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

    // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
    if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
        !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
        // Must happen before any extras exist, since they point into the old allocation.
        GGML_ASSERT(ctx->tensor_extras.empty());
        // Save what we need, then replace the context with a remote one of the same size.
        auto device = ctx->device;
        auto size = ctx->size;
        auto * data_prev = ctx->data;
        delete ctx;
        ctx = new ggml_backend_openvino_buffer_context(device, size, true);
        buffer->context = ctx;
        // Rebase the tensor's data pointer: keep its offset relative to the buffer base.
        tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev);
    }

    // Views share the extra from view_src
    if (tensor->view_src != nullptr) {
        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
        if (tensor->view_src->extra != nullptr) {
            tensor->extra = tensor->view_src->extra;
        }
        return GGML_STATUS_SUCCESS;
    }

    // Re-read: the context may have been replaced by the kvcache branch above.
    ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

    if (tensor->data != nullptr && !ggml_is_quantized(tensor->type)) {
        ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote);
        if (extra != nullptr) {
            // Replace (and free) any extra created for this tensor earlier.
            auto it = ctx->tensor_extras.find(tensor);
            if (it != ctx->tensor_extras.end()) {
                delete it->second;
            }
            ctx->tensor_extras[tensor] = extra;
            tensor->extra = extra;
        }
    }

    return GGML_STATUS_SUCCESS;
}
188
+
189
+ static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buffer,
190
+ ggml_tensor * tensor,
191
+ uint8_t value,
192
+ size_t offset,
193
+ size_t size) {
194
+ // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
195
+ GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
196
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
197
+
198
+ if (ctx->is_remote) {
199
+ // For remote (device) buffers, use OpenCL USM memfill
200
+ cl_command_queue queue = ggml_openvino_get_cl_queue();
201
+ auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
202
+ if (queue != nullptr && mem_fill_fn != nullptr) {
203
+ uint8_t pattern = value;
204
+ cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr,
205
+ nullptr);
206
+ if (err != CL_SUCCESS) {
207
+ GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
208
+ }
209
+ clFinish(queue);
210
+ } else {
211
+ GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__);
212
+ }
213
+ } else {
214
+ memset((char *) tensor->data + offset, value, size);
215
+ }
216
+ }
217
+
218
// Uploads host data into a tensor, with a fast path for model weights.
//
// Weight path (weight-usage buffer + full-tensor write + 2D shape): instead of a
// plain copy, the raw ggml data is converted once into an OpenVINO constant
// (dequantized / repacked as needed) and cached in tensor->extra, so graph
// construction later avoids a per-graph memcpy. On any conversion failure we
// fall back to a raw copy.
//
// All other tensors (KV cache, activations, test-backend-ops): copy the bytes
// (via OpenCL USM memcpy for remote buffers) and attach a plain tensor extra.
static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                    ggml_tensor * tensor,
                                                    const void * data,
                                                    size_t offset,
                                                    size_t size) {
    // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
    GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

    // Check if this is a weight buffer (usage is set BEFORE set_tensor is called, except in test-backend-ops)
    bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    // Full tensor set: offset=0, full size, not a view
    bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr);
    // 2D tensor (typical weight shape)
    bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);

    if (is_weight_buffer && is_full_tensor_set && is_2d) {
        try {
            // Convert the raw ggml weight into an OpenVINO node (writes into tensor->data).
            auto result = process_weight_tensor(tensor, data, tensor->data);
            result.weight_node->set_friendly_name(tensor->name);

            // const auto & layout = result.layout;
            ggml_openvino_extra_base * extra;

            // Quantized path with extracted weight/scale/zp tensors
            if (result.is_quantized()) {
                extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
                                                                 std::move(result.zp), result.weight_node);

                // if (layout.is_requant) {
                //     GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
                //                    extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
                //                    layout.weights_per_block);
                // } else {
                //     int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
                //     GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
                //                    __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
                // }
            } else {
                // F16/F32/BF16 weight or F16-requant
                extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);

                // if (layout.total_size > 0) {
                //     GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
                // } else {
                //     GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
                // }
            }

            ctx->tensor_extras[tensor] = extra;
            tensor->extra = extra;

        } catch (const std::exception & e) {
            // Conversion failed: keep the raw bytes so the tensor is still usable.
            GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
            memcpy((char *) tensor->data + offset, data, size);
        }
    } else {
        // Non-weight tensor (KV cache, activations, etc.) - copy data. test-backend-ops also goes here
        if (ctx->is_remote) {
            // Blocking host-to-device copy via the OpenCL USM extension.
            cl_command_queue queue = ggml_openvino_get_cl_queue();
            auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
            if (queue != nullptr && mem_cpy_fn != nullptr) {
                cl_int err =
                    mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr);
                if (err != CL_SUCCESS) {
                    GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
                }
            } else {
                GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
            }
        } else {
            memcpy((char *) tensor->data + offset, data, size);
        }

        ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote);
        if (extra == nullptr) {
            // GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name);
            return;
        }

        // Replace (and free) any extra previously attached to this tensor.
        auto it = ctx->tensor_extras.find(tensor);
        if (it != ctx->tensor_extras.end()) {
            delete it->second;
        }
        ctx->tensor_extras[tensor] = extra;
        tensor->extra = extra;
    }
}
306
+
307
+ static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer,
308
+ const ggml_tensor * tensor,
309
+ void * data,
310
+ size_t offset,
311
+ size_t size) {
312
+ // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
313
+ GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
314
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
315
+
316
+ if (ctx->is_remote) {
317
+ // For remote (device) buffers, use OpenCL USM memcpy (device-to-host)
318
+ cl_command_queue queue = ggml_openvino_get_cl_queue();
319
+ auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
320
+ if (queue != nullptr && mem_cpy_fn != nullptr) {
321
+ cl_int err =
322
+ mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr);
323
+ if (err != CL_SUCCESS) {
324
+ GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err);
325
+ }
326
+ } else {
327
+ GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
328
+ }
329
+ } else {
330
+ memcpy(data, (const char *) tensor->data + offset, size);
331
+ }
332
+ }
333
+
334
+ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
335
+ const ggml_tensor * src,
336
+ ggml_tensor * dst) {
337
+ // GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name);
338
+ GGML_ASSERT(src != nullptr && dst != nullptr);
339
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
340
+
341
+ if (ctx->is_remote) {
342
+ // For remote (device) buffers, use OpenCL USM memcpy
343
+ cl_command_queue queue = ggml_openvino_get_cl_queue();
344
+ auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL();
345
+ if (queue == nullptr || mem_cpy_fn == nullptr) {
346
+ GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__);
347
+ return false;
348
+ }
349
+ // Can copy from host to device
350
+ if (ggml_backend_buffer_is_host(src->buffer)) {
351
+ cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
352
+ if (err != CL_SUCCESS) {
353
+ GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err);
354
+ return false;
355
+ }
356
+ return true;
357
+ }
358
+ // Can also copy from device to device if both are OpenVINO remote buffers
359
+ if (ggml_backend_buffer_is_openvino(src->buffer)) {
360
+ ggml_backend_openvino_buffer_context * src_ctx =
361
+ (ggml_backend_openvino_buffer_context *) src->buffer->context;
362
+ if (src_ctx->is_remote) {
363
+ cl_int err =
364
+ mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
365
+ if (err != CL_SUCCESS) {
366
+ GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
367
+ err);
368
+ return false;
369
+ }
370
+ return true;
371
+ }
372
+ }
373
+ return false;
374
+ }
375
+
376
+ // Host buffer - can copy from any host buffer
377
+ if (ggml_backend_buffer_is_host(src->buffer)) {
378
+ memcpy(dst->data, src->data, ggml_nbytes(src));
379
+ return true;
380
+ }
381
+ return false;
382
+ }
383
+
384
+ static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
385
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
386
+ GGML_ASSERT(ctx->data != nullptr);
387
+ if (ctx->is_remote) {
388
+ cl_command_queue queue = ggml_openvino_get_cl_queue();
389
+ auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL();
390
+ if (queue != nullptr && mem_fill_fn != nullptr) {
391
+ uint8_t pattern = value;
392
+ cl_int err = mem_fill_fn(queue, ctx->data, &pattern, sizeof(pattern), ctx->size, 0, nullptr, nullptr);
393
+ if (err != CL_SUCCESS) {
394
+ GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err);
395
+ }
396
+ clFinish(queue);
397
+ } else {
398
+ GGML_LOG_WARN("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n",
399
+ __func__);
400
+ }
401
+ } else {
402
+ memset(ctx->data, value, ctx->size);
403
+ }
404
+ }
405
+
406
// v-table of buffer operations registered for every OpenVINO buffer
// (passed to ggml_backend_buffer_init in alloc_buffer).
static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = {
    /* .free_buffer = */ ggml_backend_openvino_buffer_free_buffer,
    /* .get_base = */ ggml_backend_openvino_buffer_get_base,
    /* .init_tensor = */ ggml_backend_openvino_buffer_init_tensor,
    /* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor,
    /* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor,
    /* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor,
    /* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor,
    /* .clear = */ ggml_backend_openvino_buffer_clear,
    /* .reset = */ NULL,
};
417
+
418
+ // Buffer type interface functions
419
+ static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
420
+ ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
421
+ return ctx->name.c_str();
422
+ }
423
+
424
+ static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
425
+ size_t size) {
426
+ ggml_backend_openvino_buffer_type_context * buft_ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
427
+
428
+ // Create buffer context with contiguous memory allocation
429
+ ggml_backend_openvino_buffer_context * ctx = new ggml_backend_openvino_buffer_context(buft_ctx->device, size);
430
+
431
+ if (ctx->data == nullptr && size > 0) {
432
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
433
+ delete ctx;
434
+ return nullptr;
435
+ }
436
+
437
+ return ggml_backend_buffer_init(buft, ggml_backend_openvino_buffer_interface, ctx, size);
438
+ }
439
+
440
// All allocations from this buffer type are aligned to TENSOR_ALIGNMENT
// (enforced in the buffer context constructor).
static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return TENSOR_ALIGNMENT;
}
444
+
445
// No backend-imposed cap on a single buffer's size.
static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return SIZE_MAX;
}
449
+
450
+ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
451
+ const ggml_tensor * tensor) {
452
+ GGML_UNUSED(buft);
453
+
454
+ // For quantized 2D tensors (weights), we need extra space for extracted data
455
+ if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
456
+ ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
457
+ if (layout.total_size > 0) {
458
+ // GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
459
+ // __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,
460
+ // layout.scales_size, layout.zp_size);
461
+ return layout.total_size;
462
+ }
463
+ }
464
+
465
+ return ggml_nbytes(tensor);
466
+ }
467
+
468
// v-table for the OpenVINO buffer type (shared by all devices; the per-device
// state lives in each buffer type's context).
static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = {
    /* .get_name = */ ggml_backend_openvino_buffer_type_get_name,
    /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer,
    /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment,
    /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size,
    /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size,
    /* .is_host = */ nullptr,
};
476
+
477
// Get buffer type for a specific device.
// Buffer types (and the contexts they point at) are created lazily on first
// call and live in function-local statics for the lifetime of the process,
// so the returned pointer never dangles. First-time initialization is
// serialized by a mutex.
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());

    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    // Contexts must outlive the buffer types that hold pointers into them;
    // both vectors are sized once and never resized afterwards.
    static std::vector<ggml_backend_buffer_type> buffer_types;
    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;

    if (buffer_types.empty()) {
        int device_count = ggml_backend_openvino_get_device_count();
        buffer_types.resize(device_count);
        buffer_type_contexts.resize(device_count);

        for (int i = 0; i < device_count; i++) {
            buffer_type_contexts[i].device = i;
            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);

            buffer_types[i] = ggml_backend_buffer_type{
                /* .iface   = */ ggml_backend_openvino_buffer_type_interface,
                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
                /* .context = */ &buffer_type_contexts[i],
            };
        }
    }

    return &buffer_types[device];
}
506
+
507
+ // =====================================================
508
+ // OpenVINO Host Buffer Implementation
509
+ // =====================================================
510
+
511
+ static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
512
+ ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
513
+ static std::string name;
514
+ name = ctx->name + "_HOST";
515
+ return name.c_str();
516
+ }
517
+
518
+ static bool ggml_backend_openvino_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
519
+ GGML_UNUSED(buft);
520
+ return true;
521
+ }
522
+
523
// vtable for the host buffer type; identical to the device buffer type except
// for the name and a non-null is_host hook.
static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_openvino_host_buffer_type_get_name,
    /* .alloc_buffer     = */ ggml_backend_openvino_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_openvino_buffer_type_get_alignment,
    /* .get_max_size     = */ ggml_backend_openvino_buffer_type_get_max_size,
    /* .get_alloc_size   = */ ggml_backend_openvino_buffer_type_get_alloc_size,
    /* .is_host          = */ ggml_backend_openvino_host_buffer_type_is_host,
};
531
+
532
// Host-buffer-type accessor; mirrors ggml_backend_openvino_buffer_type but
// wires in the host interface. Lazily initialized under a mutex; the returned
// pointer aliases a static vector element and stays valid for process lifetime.
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) {
    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());

    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    // Statics are sized once and never resized, so pointers into them are stable.
    static std::vector<ggml_backend_buffer_type> buffer_types;
    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;

    if (buffer_types.empty()) {
        int device_count = ggml_backend_openvino_get_device_count();
        buffer_types.resize(device_count);
        buffer_type_contexts.resize(device_count);

        for (int i = 0; i < device_count; i++) {
            buffer_type_contexts[i].device = i;
            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);

            buffer_types[i] = ggml_backend_buffer_type{
                /* .iface   = */ ggml_backend_openvino_host_buffer_type_interface,
                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
                /* .context = */ &buffer_type_contexts[i],
            };
        }
    }

    return &buffer_types[device];
}
560
+
561
// A buffer belongs to this backend iff its vtable uses our free_buffer hook
// (standard ggml identity-check idiom; cheaper than inspecting the context).
bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
    return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}
564
+
565
+ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
566
+ if (!ggml_backend_buffer_is_openvino(buffer)) {
567
+ return 0;
568
+ }
569
+ ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
570
+ return ctx->id;
571
+ }
572
+
573
// Attach backend-specific extra data to `tensor`, transferring ownership of
// `extra` to the tensor's buffer context (freed when the buffer is freed).
// Re-registering for the same tensor destroys the previous extra first, so
// repeated calls do not leak.
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(tensor->buffer != nullptr);
    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));

    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);

    // Replace (and free) any extra previously registered for this tensor.
    auto it = ctx->tensor_extras.find(tensor);
    if (it != ctx->tensor_extras.end()) {
        delete it->second;
    }

    ctx->tensor_extras[tensor] = extra;
    tensor->extra = extra;
}
588
+
589
// Identity check: a buffer type is ours iff it uses our get_name hook.
bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
}
592
+
593
// Identity check for the host variant of the OpenVINO buffer type.
bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name;
}
596
+
597
// Destroy a backend instance created by ggml_backend_openvino_init:
// free its context first, then the backend object itself.
static void ggml_backend_openvino_free(ggml_backend_t backend) {
    ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
    delete ctx;
    delete backend;
}
602
+
603
+ static const char * ggml_backend_openvino_get_name(ggml_backend_t backend) {
604
+ return GGML_OPENVINO_NAME;
605
+ GGML_UNUSED(backend);
606
+ }
607
+
608
+ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
609
+ return ov_graph_compute(cgraph, backend);
610
+ GGML_UNUSED(backend);
611
+ }
612
+
613
// Backend vtable: only synchronous graph compute is implemented; async
// transfers, graph plans and events are left NULL (unsupported).
static const ggml_backend_i ggml_backend_openvino_interface = {
    /* .get_name           = */ ggml_backend_openvino_get_name,
    /* .free               = */ ggml_backend_openvino_free,
    /* .set_tensor_async   = */ NULL,
    /* .get_tensor_async   = */ NULL,
    /* .cpy_tensor_async   = */ NULL,
    /* .synchronize        = */ NULL,
    /* .graph_plan_create  = */ NULL,
    /* .graph_plan_free    = */ NULL,
    /* .graph_plan_update  = */ NULL,
    /* .graph_plan_compute = */ NULL,
    /* .graph_compute      = */ ggml_backend_openvino_graph_compute,
    /* .event_record       = */ NULL,
    /* .event_wait         = */ NULL,
    /* .graph_optimize     = */ NULL,
};
629
+
630
// Number of OpenVINO devices exposed to ggml. A single logical device is
// reported; the concrete target is chosen via the device config elsewhere.
int ggml_backend_openvino_get_device_count() {
    return 1;
}
633
+
634
// Stable GUID identifying this backend (compared by ggml_backend_is_openvino).
static ggml_guid_t ggml_backend_openvino_guid(void) {
    static ggml_guid guid = {0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97,
                             0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d};
    return &guid;
}
639
+
640
// Process-wide OpenVINO runtime context (function-local static singleton);
// every backend instance shares the same ov_runtime_context.
static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
    static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
    return r_ctx;
}
644
+
645
// backend API
// Create a backend instance for `device`. Returns nullptr on invalid device.
// The instance shares the process-wide runtime context and must be released
// with ggml_backend_openvino_free (via ggml_backend_free).
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
    if (device < 0 || device >= ggml_backend_openvino_get_device_count()) {
        GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
        return nullptr;
    }

    ggml_backend_openvino_context * ctx = new ggml_backend_openvino_context;
    // NOTE(review): plain `new` throws std::bad_alloc instead of returning
    // nullptr, so this check is effectively dead code (kept defensively).
    if (ctx == nullptr) {
        GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
        return nullptr;
    }

    ctx->runtime_context = get_ov_runtime_context_ptr();
    if (ctx->runtime_context == nullptr) {
        GGML_LOG_ERROR("%s: failed to allocate runtime context\n", __func__);
        delete ctx;
        return nullptr;
    }

    std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
    r_ctx->device = ggml_openvino_get_device_name();
    // NOTE(review): any non-empty setting (even "0") enables stateful
    // execution -- only presence of the env var is checked. NPU always
    // forces stateless mode.
    r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();

    ggml_backend_t openvino_backend = new ggml_backend{
        /* .guid      = */ ggml_backend_openvino_guid(),
        /* .interface = */ ggml_backend_openvino_interface,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device),
        /* .context   = */ ctx,
    };

    return openvino_backend;
}
678
+
679
// True iff `backend` is an OpenVINO backend instance (GUID match, null-safe).
GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid());
}
682
+
683
// Per-device registry context.
struct ggml_backend_openvino_device_context {
    int device;               // 0-based ggml device index
    std::string name;         // set to GGML_OPENVINO_NAME + index at registration
    std::string description;  // OpenVINO version description string
};
688
+
689
+ static const char * ggml_backend_openvino_device_get_name(ggml_backend_dev_t dev) {
690
+ ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
691
+ return ctx->name.c_str();
692
+ }
693
+
694
+ static const char * ggml_backend_openvino_device_get_description(ggml_backend_dev_t dev) {
695
+ ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
696
+ return ctx->description.c_str();
697
+ }
698
+
699
+ static void ggml_backend_openvino_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
700
+ #ifdef _WIN32
701
+ MEMORYSTATUSEX status;
702
+ status.dwLength = sizeof(status);
703
+ GlobalMemoryStatusEx(&status);
704
+ *total = status.ullTotalPhys;
705
+ *free = status.ullAvailPhys;
706
+ #else
707
+ long pages = sysconf(_SC_PHYS_PAGES);
708
+ long page_size = sysconf(_SC_PAGE_SIZE);
709
+ *total = pages * page_size;
710
+
711
+ // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
712
+ *free = *total;
713
+ #endif // _WIN32
714
+
715
+ GGML_UNUSED(dev);
716
+ }
717
+
718
+ static enum ggml_backend_dev_type ggml_backend_openvino_device_get_type(ggml_backend_dev_t dev) {
719
+ GGML_UNUSED(dev);
720
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
721
+ }
722
+
723
// Fill in the ggml device properties from the accessors above.
static void ggml_backend_openvino_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name = ggml_backend_openvino_device_get_name(dev);
    props->description = ggml_backend_openvino_device_get_description(dev);
    props->type = ggml_backend_openvino_device_get_type(dev);
    ggml_backend_openvino_device_get_memory(dev, &props->memory_free, &props->memory_total);

    // NOTE(review): host_buffer is reported false even though a host buffer
    // type exists (ggml_backend_openvino_host_buffer_type) -- confirm this is
    // intentional.
    props->caps = {
        /* .async                = */ false,
        /* .host_buffer          = */ false,
        /* .buffer_from_host_ptr = */ false,
        /* .events               = */ false,
    };
}
736
+
737
+ static ggml_backend_t ggml_backend_openvino_device_init(ggml_backend_dev_t dev, const char * params) {
738
+ GGML_UNUSED(params);
739
+ ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
740
+ return ggml_backend_openvino_init(ctx->device);
741
+ }
742
+
743
+ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) {
744
+ ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
745
+ return ggml_backend_openvino_buffer_type(ctx->device);
746
+ }
747
+
748
+ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) {
749
+ ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
750
+ return ggml_backend_openvino_host_buffer_type(ctx->device);
751
+ }
752
+
753
+ static bool has_view_op_input(const ggml_tensor * op) {
754
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
755
+ if (op->src[i] == nullptr) {
756
+ break;
757
+ }
758
+ if (op->src[i]->op == GGML_OP_VIEW) {
759
+ return true;
760
+ }
761
+ }
762
+ return false;
763
+ }
764
+
765
// Check that q, k and v (src[0..2]) follow the only flash-attention input
// shape the backend can lower:
//   src[i]->op == PERMUTE, its input is a VIEW, and that VIEW reads from a
//   real tensor (view_src == nullptr on the VIEW's input).
// NOTE(review): assumes src[0..2] are non-null for FLASH_ATTN_EXT -- holds
// for graphs ggml builds, but src[i] itself is not null-checked here.
static bool is_supported_flash_attn_pattern(const ggml_tensor * op) {
    // pattern of q,k,v should be q->op==PERMUTE, q->src[0]->op==VIEW, q->src[0]->src[0]->view_src==nullptr
    for (int i = 0; i < 3; i++) {
        const ggml_tensor * src = op->src[i];
        if (src->op != GGML_OP_PERMUTE || src->src[0] == nullptr || src->src[0]->op != GGML_OP_VIEW ||
            src->src[0]->src[0] == nullptr || src->src[0]->src[0]->view_src != nullptr) {
            return false;
        }
    }
    return true;
}
776
+
777
// Per-op blacklist: returns true when `op`, although of a generally supported
// kind, hits a specific configuration the OpenVINO lowering cannot handle.
// Called from ggml_backend_openvino_device_supports_op after the generic
// op/type whitelists pass.
static bool is_op_unsupported_case(const ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_GET_ROWS:
        case GGML_OP_SET_ROWS: {
            // Only up to 3 effective dims are supported.
            if (op->ne[3] != 1) {
                return true;
            }
            break;
        }
        case GGML_OP_ADD:
        case GGML_OP_MUL: {
            if (op->src[1]->op == GGML_OP_PERMUTE) {
                return true;
            }
            // Only numpy-style broadcasting (dim equal or 1) is supported.
            for (int i = 0; i < 4; i++) {
                if (op->src[0]->ne[i] != op->src[1]->ne[i] && (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1)) {
                    return true;
                }
            }
            break;
        }
        case GGML_OP_SOFT_MAX: {
            if (op->src[2] != nullptr) {
                // OpenVINO backend does not support SOFT_MAX with sinks
                return true;
            }
            // op_params layout: [0]=scale (f32), [1]=max_bias (f32)
            float scale = 1.0f;
            float max_bias = 0.0f;
            const auto * op_params = op->op_params;
            memcpy(&scale, (const float *) op_params + 0, sizeof(float));
            memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
            if (max_bias > 0) {
                // OpenVINO backend does not support SOFT_MAX with max_bias > 0
                return true;
            }
            break;
        }
        case GGML_OP_FLASH_ATTN_EXT: {
            if (op->src[4] != nullptr) {
                // OpenVINO backend does not support FLASH_ATTN_EXT with sinks
                return true;
            }
            if (!is_supported_flash_attn_pattern(op)) {
                return true;
            }
            // op_params layout: [0]=scale, [1]=max_bias, [2]=logit_softcap (f32 each)
            float scale = 1.0f;
            float max_bias = 0.0f;
            float logit_softcap = 0.0f;
            const auto * op_params = op->op_params;
            memcpy(&scale, (const float *) op_params + 0, sizeof(float));
            memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
            memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float));
            if (max_bias > 0) {
                // OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0
                return true;
            }
            if (logit_softcap != 0) {
                // OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0
                return true;
            }
            break;
        }
        case GGML_OP_PERMUTE: {
            if (op->type == GGML_TYPE_BF16) {
                // err msg: [GPU] Could not find a suitable kernel for transpose
                return true;
            }
            break;
        }
        case GGML_OP_CPY: {
            // Only in-place CPY (a pure cast, where dst is the op itself) is supported.
            if (op->src[1] != op) {
                return true;
            }
            break;
        }
        case GGML_OP_MUL_MAT: {
            if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
                // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
                return true;
            }
            // Batch dim must broadcast (equal or 1).
            if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
                return true;
            }
            if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
                return true;
            }
            if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
                // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
                // triggers a bug in ov matmul_shape_inference.hpp
                return true;
            }
            if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) {
                return true;
            }
            break;
        }
        case GGML_OP_ROPE: {
            const int32_t * op_params = op->op_params;
            const int n_dims = op_params[1];
            const int mode = op_params[2];
            if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
                // OpenVINO backend only supports NORMAL and NEOX RoPE modes
                return true;
            }
            // NOTE(review): `0.0f` compares an int against a float literal;
            // behaviorally identical to `!= 0` but almost certainly a typo.
            if (n_dims != 0.0f && n_dims != op->src[0]->ne[0]) {
                // partial-rotation RoPE (n_dims != head size) is unsupported
                return true;
            }
            if (op->type != GGML_TYPE_F32) {
                return true;
            }
            // op_params[6]=freq_scale, op_params[7]=ext_factor (f32 each);
            // freq_scale is read but not currently checked.
            float freq_scale;
            float ext_factor;
            memcpy(&freq_scale, op_params + 6, sizeof(float));
            memcpy(&ext_factor, op_params + 7, sizeof(float));
            if (ext_factor != 0.0f) {
                // YaRN extrapolation (ext_factor != 0) is unsupported
                return true;
            }
            if (op->src[0]->op == GGML_OP_VIEW) {
                // only views whose source row count matches the op's head count are supported
                if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
                    return true;
                }
            }
            break;
        }
        default:
            break;
    }
    if (op->op == GGML_OP_GET_ROWS) {
        if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
            // ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
            // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
            return true;
        }
    }
    return false;
}
924
+
925
// Registry hook: decide whether the OpenVINO backend can execute `op`.
// Three-stage filter: (1) op kind must be whitelisted (with separate lists
// for UNARY/GLU sub-ops), (2) output and all source types must be supported
// (and quantized sources must be 2D), (3) is_op_unsupported_case() rejects
// known-bad configurations of otherwise supported ops.
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    GGML_ASSERT(dev->reg != nullptr);

    static std::set<ggml_type> supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64,
                                               GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K,
                                               GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};

    static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
                                                 /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
                                                 GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
                                                 // softmax is not updated due to replaced by flash_attn_ext
                                                 // GGML_OP_SOFT_MAX,
                                                 GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
    static const std::set<ggml_unary_op> supported_unary_ops{
        GGML_UNARY_OP_SILU,
    };
    static const std::set<ggml_glu_op> supported_glu_ops{
        GGML_GLU_OP_SWIGLU,
        GGML_GLU_OP_GEGLU,
    };

    switch (op->op) {
        case GGML_OP_UNARY: {
            // Unary ops have their own whitelist and reject VIEW inputs.
            auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end();
            if (!supported) {
                return false;
            }
            if (has_view_op_input(op)) {
                return false;
            }
            break;
        }
        case GGML_OP_GLU: {
            // GLU ops: own whitelist, no VIEW inputs, and for the single-input
            // (split) form the split axis must be even.
            auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end();
            if (!supported) {
                return false;
            }
            if (has_view_op_input(op)) {
                return false;
            }
            if (op->src[1] == nullptr && op->src[0]->ne[0] % 2 != 0) {
                // triggers bug in ov gpu
                return false;
            }
            break;
        }
        default: {
            auto supported = supported_ops.find(op->op) != supported_ops.end();
            if (!supported) {
                return false;
            }
            // Some ops cannot consume VIEW-produced inputs.
            static std::set<ggml_op> ops_not_support_view_input{
                GGML_OP_GET_ROWS,
                GGML_OP_RMS_NORM,
            };
            if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) {
                return false;
            }
        }
    }

    // Output and every source tensor must have a supported type.
    if (supported_types.find(op->type) == supported_types.end()) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        auto * src = op->src[i];
        if (src == nullptr) {
            break;
        }
        if (supported_types.find(src->type) == supported_types.end()) {
            return false;
        }
        if (ggml_is_quantized(src->type) && src->ne[2] != 1) {
            // 3D quantized tensors are unsupported
            return false;
        }
    }

    if (is_op_unsupported_case(op)) {
        return false;
    }
    return true;
}
1018
+
1019
+ static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
1020
+ return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft);
1021
+ GGML_UNUSED(dev);
1022
+ }
1023
+
1024
// Device vtable; optional hooks (host-pointer import, offload, events) are NULL.
static const struct ggml_backend_device_i ggml_backend_openvino_device_interface = {
    /* .get_name             = */ ggml_backend_openvino_device_get_name,
    /* .get_description      = */ ggml_backend_openvino_device_get_description,
    /* .get_memory           = */ ggml_backend_openvino_device_get_memory,
    /* .get_type             = */ ggml_backend_openvino_device_get_type,
    /* .get_props            = */ ggml_backend_openvino_device_get_props,
    /* .init_backend         = */ ggml_backend_openvino_device_init,
    /* .get_buffer_type      = */ ggml_backend_openvino_device_get_buffer_type,
    /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type,
    /* .buffer_from_host_ptr = */ NULL,
    /* .supports_op          = */ ggml_backend_openvino_device_supports_op,
    /* .supports_buft        = */ ggml_backend_openvino_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
1041
+
1042
// Registry context: owns the list of device handles created at registration.
struct ggml_backend_openvino_reg_context {
    std::vector<ggml_backend_dev_t> devices;
};
1045
+
1046
+ static const char * ggml_backend_openvino_reg_get_name(ggml_backend_reg_t reg) {
1047
+ return GGML_OPENVINO_NAME;
1048
+ GGML_UNUSED(reg);
1049
+ }
1050
+
1051
+ static size_t ggml_backend_openvino_reg_get_device_count(ggml_backend_reg_t reg) {
1052
+ GGML_UNUSED(reg);
1053
+ return (size_t) ggml_backend_openvino_get_device_count();
1054
+ }
1055
+
1056
+ static ggml_backend_dev_t ggml_backend_openvino_reg_get_device(ggml_backend_reg_t reg, size_t index) {
1057
+ ggml_backend_openvino_reg_context * ctx = (ggml_backend_openvino_reg_context *) reg->context;
1058
+ GGML_ASSERT(index < ctx->devices.size());
1059
+ return ctx->devices[index];
1060
+ }
1061
+
1062
// Registry vtable; get_proc_address is not provided.
static const struct ggml_backend_reg_i ggml_backend_openvino_reg_interface = {
    /* .get_name         = */ ggml_backend_openvino_reg_get_name,
    /* .get_device_count = */ ggml_backend_openvino_reg_get_device_count,
    /* .get_device       = */ ggml_backend_openvino_reg_get_device,
    /* .get_proc_address = */ NULL,
};
1068
+
1069
// One-time backend setup, called from ggml_backend_openvino_reg under its lock.
static void ggml_openvino_init() {
    // Initialize device config singleton from env var
    ggml_openvino_init_device_config();
    GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str());
}
1074
+
1075
// Entry point: return the (process-wide) OpenVINO backend registry.
// First call performs one-time initialization under a mutex: runs
// ggml_openvino_init, creates one device handle per device, and fills in the
// static `reg`. The reg context and device contexts are heap-allocated and
// intentionally never freed (they live for the whole process).
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void) {
    static ggml_backend_reg reg;

    static bool initialized = false;
    {
        // `initialized` is only read/written while holding the lock.
        static std::mutex mutex;
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_openvino_init();

            ggml_backend_openvino_reg_context * ctx = new ggml_backend_openvino_reg_context;

            for (int i = 0; i < ggml_backend_openvino_get_device_count(); i++) {
                ggml_backend_openvino_device_context * dev_ctx = new ggml_backend_openvino_device_context;
                dev_ctx->device = i;
                dev_ctx->name = GGML_OPENVINO_NAME + std::to_string(i);

                dev_ctx->description = ov::get_openvino_version().description;

                // Devices hold a pointer back to the static `reg`, which is
                // safe because `reg` has static storage duration.
                ggml_backend_dev_t dev =
                    new ggml_backend_device{/* .interface = */ ggml_backend_openvino_device_interface,
                                            /* .reg = */ &reg,
                                            /* .context = */ dev_ctx};
                ctx->devices.push_back(dev);
            }

            reg = ggml_backend_reg{/* .api_version = */ GGML_BACKEND_API_VERSION,
                                   /* .iface = */ ggml_backend_openvino_reg_interface,
                                   /* .context = */ ctx};
        }

        initialized = true;
    }

    return &reg;
}