whispercpp 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (891)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +158 -44
  4. data/ext/extconf.rb +3 -2
  5. data/ext/ruby_whisper.c +34 -6
  6. data/ext/ruby_whisper.h +67 -0
  7. data/ext/ruby_whisper_context.c +236 -144
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +12 -13
  10. data/ext/ruby_whisper_params.c +47 -24
  11. data/ext/ruby_whisper_segment.c +84 -20
  12. data/ext/ruby_whisper_token.c +371 -0
  13. data/ext/ruby_whisper_transcribe.cpp +5 -2
  14. data/ext/ruby_whisper_vad_context.c +122 -0
  15. data/ext/ruby_whisper_vad_context_detect.cpp +51 -0
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +138 -0
  18. data/ext/ruby_whisper_vad_segments.c +105 -0
  19. data/ext/sources/CMakeLists.txt +4 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  22. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  23. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  24. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  25. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  26. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  27. data/ext/sources/examples/bench/bench.cpp +23 -18
  28. data/ext/sources/examples/cli/cli.cpp +129 -112
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  31. data/ext/sources/examples/miniaudio.h +4507 -2131
  32. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/server/server.cpp +28 -15
  34. data/ext/sources/examples/talk-llama/CMakeLists.txt +8 -3
  35. data/ext/sources/examples/talk-llama/llama-adapter.cpp +5 -2
  36. data/ext/sources/examples/talk-llama/llama-adapter.h +7 -0
  37. data/ext/sources/examples/talk-llama/llama-arch.cpp +2378 -1988
  38. data/ext/sources/examples/talk-llama/llama-arch.h +109 -2
  39. data/ext/sources/examples/talk-llama/llama-batch.cpp +78 -34
  40. data/ext/sources/examples/talk-llama/llama-batch.h +17 -4
  41. data/ext/sources/examples/talk-llama/llama-chat.cpp +100 -4
  42. data/ext/sources/examples/talk-llama/llama-chat.h +5 -0
  43. data/ext/sources/examples/talk-llama/llama-context.cpp +1088 -403
  44. data/ext/sources/examples/talk-llama/llama-context.h +70 -23
  45. data/ext/sources/examples/talk-llama/llama-cparams.h +6 -0
  46. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  47. data/ext/sources/examples/talk-llama/llama-grammar.cpp +295 -60
  48. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  49. data/ext/sources/examples/talk-llama/llama-graph.cpp +925 -155
  50. data/ext/sources/examples/talk-llama/llama-graph.h +234 -23
  51. data/ext/sources/examples/talk-llama/llama-hparams.cpp +79 -38
  52. data/ext/sources/examples/talk-llama/llama-hparams.h +118 -18
  53. data/ext/sources/examples/talk-llama/llama-impl.cpp +11 -7
  54. data/ext/sources/examples/talk-llama/llama-impl.h +14 -2
  55. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +8 -4
  56. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +405 -140
  57. data/ext/sources/examples/talk-llama/llama-kv-cache.h +24 -10
  58. data/ext/sources/examples/talk-llama/llama-kv-cells.h +44 -2
  59. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  60. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  61. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +12 -10
  62. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +42 -31
  63. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +2 -2
  64. data/ext/sources/examples/talk-llama/llama-mmap.cpp +197 -45
  65. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  66. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +606 -116
  67. data/ext/sources/examples/talk-llama/llama-model-loader.h +41 -5
  68. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +61 -44
  69. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  70. data/ext/sources/examples/talk-llama/llama-model.cpp +2756 -13643
  71. data/ext/sources/examples/talk-llama/llama-model.h +112 -18
  72. data/ext/sources/examples/talk-llama/llama-quant.cpp +582 -365
  73. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +1409 -199
  74. data/ext/sources/examples/talk-llama/llama-sampler.h +42 -0
  75. data/ext/sources/examples/talk-llama/llama-vocab.cpp +248 -82
  76. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -40
  77. data/ext/sources/examples/talk-llama/llama.cpp +802 -21
  78. data/ext/sources/examples/talk-llama/llama.h +210 -39
  79. data/ext/sources/examples/talk-llama/models/afmoe.cpp +190 -0
  80. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  81. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  82. data/ext/sources/examples/talk-llama/models/arctic.cpp +137 -0
  83. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  84. data/ext/sources/examples/talk-llama/models/baichuan.cpp +123 -0
  85. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +143 -0
  86. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +133 -0
  87. data/ext/sources/examples/talk-llama/models/bert.cpp +184 -0
  88. data/ext/sources/examples/talk-llama/models/bitnet.cpp +145 -0
  89. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  90. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  91. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  92. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  93. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  94. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  95. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  96. data/ext/sources/examples/talk-llama/models/dbrx.cpp +122 -0
  97. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  98. data/ext/sources/examples/talk-llama/models/deepseek.cpp +142 -0
  99. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +262 -0
  100. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  101. data/ext/sources/examples/talk-llama/models/dots1.cpp +132 -0
  102. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  103. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +148 -0
  104. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  105. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  106. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  107. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  108. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  109. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +111 -0
  110. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  111. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  112. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  113. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  114. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  115. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  116. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  117. data/ext/sources/examples/talk-llama/models/glm4.cpp +157 -0
  118. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  119. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  120. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +195 -0
  121. data/ext/sources/examples/talk-llama/models/granite.cpp +210 -0
  122. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  123. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +139 -0
  124. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  125. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +153 -0
  126. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  127. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  128. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  129. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  130. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  131. data/ext/sources/examples/talk-llama/models/lfm2.cpp +196 -0
  132. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  133. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  134. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  135. data/ext/sources/examples/talk-llama/models/llama.cpp +175 -0
  136. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  137. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +289 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +54 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +129 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +200 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +123 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +704 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +109 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +162 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  156. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  158. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  159. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  160. data/ext/sources/examples/talk-llama/models/plamo2.cpp +320 -0
  161. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  162. data/ext/sources/examples/talk-llama/models/plm.cpp +169 -0
  163. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  166. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3.cpp +120 -0
  168. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  169. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +131 -0
  171. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +525 -0
  172. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +140 -0
  173. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +132 -0
  174. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +164 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  178. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  179. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +137 -0
  180. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  181. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  182. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  183. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  184. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  185. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  186. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  187. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  188. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  189. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  190. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  191. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  192. data/ext/sources/examples/talk-llama/unicode.cpp +121 -79
  193. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  194. data/ext/sources/examples/whisper.wasm/index-tmpl.html +1 -1
  195. data/ext/sources/ggml/CMakeLists.txt +90 -56
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +5 -2
  198. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  199. data/ext/sources/ggml/include/ggml-cpu.h +6 -0
  200. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  201. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  202. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  203. data/ext/sources/ggml/include/ggml-rpc.h +14 -12
  204. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +246 -21
  207. data/ext/sources/ggml/src/CMakeLists.txt +85 -11
  208. data/ext/sources/ggml/src/ggml-alloc.c +128 -50
  209. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  210. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  211. data/ext/sources/ggml/src/ggml-backend-impl.h +1 -4
  212. data/ext/sources/ggml/src/ggml-backend-reg.cpp +54 -88
  213. data/ext/sources/ggml/src/ggml-backend.cpp +76 -23
  214. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +18 -4
  215. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +11 -11
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +58 -46
  217. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +139 -48
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2427 -1785
  219. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +238 -362
  220. data/ext/sources/ggml/src/ggml-cann/common.h +285 -211
  221. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +663 -831
  222. data/ext/sources/ggml/src/ggml-common.h +11 -0
  223. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +170 -95
  224. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -18
  225. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  226. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  227. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  228. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +513 -27
  229. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +4192 -992
  230. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  232. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1761 -49
  233. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  234. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  235. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  237. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +124 -24
  238. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +157 -28
  239. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  240. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -3
  242. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +251 -80
  243. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +19 -0
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +587 -119
  245. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +33 -44
  246. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1093 -194
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1284 -203
  248. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  249. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1519 -527
  250. data/ext/sources/ggml/src/ggml-cpu/ops.h +6 -4
  251. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  252. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  253. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3632 -781
  254. data/ext/sources/ggml/src/ggml-cpu/repack.h +129 -4
  255. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  256. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +152 -46
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
  258. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +152 -1
  259. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  260. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +140 -0
  261. data/ext/sources/ggml/src/ggml-cpu/vec.h +261 -146
  262. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +72 -1
  263. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  264. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +132 -6
  265. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  266. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +33 -31
  267. data/ext/sources/ggml/src/ggml-cuda/common.cuh +474 -85
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  269. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  270. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  271. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +342 -246
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  273. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  276. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +98 -74
  278. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +973 -665
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +35 -741
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1255 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +33 -40
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +40 -18
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +206 -45
  285. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  288. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  289. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1688 -302
  290. data/ext/sources/ggml/src/ggml-cuda/mean.cu +12 -10
  291. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +908 -48
  292. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +88 -20
  293. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +502 -90
  294. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  295. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  296. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +69 -176
  297. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +532 -193
  298. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +460 -104
  299. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +5 -2
  300. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +360 -122
  301. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +2 -1
  302. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  303. data/ext/sources/ggml/src/ggml-cuda/pad.cu +73 -39
  304. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +152 -1
  305. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  306. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  307. data/ext/sources/ggml/src/ggml-cuda/rope.cu +364 -149
  308. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  309. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +101 -47
  310. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  311. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  312. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +163 -41
  313. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  314. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +68 -50
  316. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  317. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +49 -84
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +22 -4
  334. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +95 -0
  335. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  336. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +275 -119
  337. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -7
  338. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  339. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/unary.cu +160 -11
  341. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +38 -0
  342. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +163 -7
  343. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  344. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  345. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -1
  346. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +6 -0
  347. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  348. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3325 -0
  349. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +46 -0
  350. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +813 -0
  351. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  352. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +891 -0
  353. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  354. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  355. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +713 -0
  356. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  357. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  358. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  359. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  360. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  361. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  362. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  363. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +155 -0
  364. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +63 -0
  365. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  366. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  367. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  368. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  369. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  370. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  371. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  372. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  373. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  374. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  375. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  376. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  377. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  378. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  379. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +26 -0
  380. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1199 -0
  381. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2670 -0
  382. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +497 -0
  383. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  384. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +419 -0
  385. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  386. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  387. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +382 -0
  388. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  389. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  390. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  391. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  392. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  393. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  394. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  395. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -13
  396. data/ext/sources/ggml/src/ggml-impl.h +129 -6
  397. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  398. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +15 -4
  399. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  400. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +173 -34
  401. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +912 -344
  402. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +124 -59
  403. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +588 -144
  404. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +396 -23
  405. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +1724 -421
  406. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +16 -3
  407. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +333 -114
  408. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3050 -1539
  409. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  410. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +30 -1
  411. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4279 -497
  412. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  413. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  414. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  415. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +267 -0
  416. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  417. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  418. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  419. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  420. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +4 -3
  421. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  422. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  423. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  424. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  425. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  426. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  427. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  428. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  429. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  430. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  431. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  432. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  433. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  434. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  435. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  436. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  437. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  438. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  439. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  440. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  441. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  442. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  443. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  444. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  445. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  446. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  447. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  448. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +35 -16
  449. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  450. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  451. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  452. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  453. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  454. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  455. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  456. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +39 -0
  457. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  458. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  459. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  460. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  461. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  462. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  463. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  464. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  465. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  466. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  467. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  468. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  469. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  470. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  471. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  472. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  473. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  474. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  475. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  476. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  477. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  478. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  479. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  480. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  481. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  482. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  483. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  484. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  485. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  486. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  487. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  488. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  489. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  490. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  491. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  492. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  493. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  494. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  495. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  496. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  497. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  498. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  499. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  500. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  501. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  502. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  503. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  504. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  505. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  506. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  507. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +438 -156
  508. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +59 -87
  509. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +81 -0
  510. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  511. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +7 -0
  512. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -29
  513. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +0 -6
  514. data/ext/sources/ggml/src/ggml-sycl/common.hpp +427 -20
  515. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +55 -44
  516. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +103 -1
  517. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  518. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  519. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  520. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +0 -3
  521. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  522. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +867 -50
  523. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +401 -358
  524. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  525. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  526. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  527. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  528. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  529. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  530. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  531. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  532. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +645 -155
  534. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  535. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +221 -66
  536. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  537. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  538. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  539. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  540. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  541. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  542. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +5 -0
  543. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  544. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  545. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  547. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  548. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +457 -281
  549. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  550. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  551. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  552. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +327 -162
  553. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  554. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  555. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  556. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  557. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  558. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  559. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  560. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  561. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  562. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  563. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  564. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  565. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  566. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  567. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  568. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  569. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  570. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  571. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  572. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  573. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  574. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  575. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  576. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  577. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  578. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  579. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  580. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  581. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  582. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  583. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  584. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  585. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  586. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  587. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  588. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  589. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  590. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  591. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  592. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  593. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  594. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  595. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  596. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  597. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  598. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  599. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  600. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  601. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +71 -0
  602. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  603. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  604. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  605. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  606. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  607. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  608. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  609. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  610. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  611. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  612. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  613. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  614. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  615. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  616. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  617. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  618. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  619. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  620. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  621. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  622. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  623. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  624. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  625. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  626. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  627. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  628. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  629. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  630. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  631. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  632. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  633. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  634. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  635. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  636. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  637. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  638. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  639. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  640. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  641. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  642. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  643. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  644. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +39 -19
  646. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +5994 -3055
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +18 -10
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +47 -49
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +4 -4
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +9 -21
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +1 -1
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +1 -1
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +1 -1
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -3
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +386 -160
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +82 -20
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +400 -174
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +123 -37
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +10 -9
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +17 -2
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +4 -4
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +3 -3
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +2 -2
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +19 -7
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +2 -3
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +13 -10
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mat_vec_base.comp → mul_mat_vec_base.glsl} +77 -29
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  745. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  746. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  747. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  748. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  749. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +39 -36
  750. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  751. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +88 -105
  752. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +41 -26
  753. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{mul_mm_funcs.comp → mul_mm_funcs.glsl} +69 -59
  754. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  755. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +92 -230
  756. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  757. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  758. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  759. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  760. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  761. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  762. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  763. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +21 -6
  764. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  765. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +10 -10
  766. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  767. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  768. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  769. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  770. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -4
  771. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  772. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  773. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  774. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  775. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  776. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +8 -49
  777. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +8 -32
  778. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +8 -32
  779. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +33 -0
  780. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +8 -38
  781. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  782. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  783. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  784. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  785. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  786. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  787. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  788. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  789. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +2 -2
  790. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  791. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  792. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  793. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  794. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  795. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  796. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  798. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  799. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  800. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  801. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  802. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +2 -25
  803. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  804. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  805. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  806. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  807. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +1 -1
  808. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  809. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  810. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  811. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  812. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  813. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +345 -26
  814. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +90 -12
  815. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +384 -180
  816. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  817. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +28 -2
  818. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1374 -0
  819. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2544 -726
  820. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  821. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  822. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  823. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  824. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  825. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  826. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  827. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  828. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  829. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +73 -15
  830. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  831. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  832. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  833. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +72 -261
  834. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +766 -0
  835. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +147 -0
  836. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +196 -0
  837. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  838. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  839. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  840. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +83 -17
  841. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  842. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  843. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  844. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  845. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  846. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  847. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  848. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  849. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  850. data/ext/sources/ggml/src/ggml.c +590 -64
  851. data/ext/sources/ggml/src/gguf.cpp +229 -44
  852. data/ext/sources/include/whisper.h +1 -0
  853. data/ext/sources/src/CMakeLists.txt +3 -1
  854. data/ext/sources/src/whisper.cpp +106 -62
  855. data/ext/sources/tests/CMakeLists.txt +2 -2
  856. data/ext/sources/tests/test-vad-full.cpp +4 -2
  857. data/ext/sources/tests/test-vad.cpp +1 -1
  858. data/extsources.rb +1 -0
  859. data/lib/whisper/model/uri.rb +17 -18
  860. data/sig/whisper.rbs +162 -4
  861. data/test/test_context_params.rb +82 -0
  862. data/test/test_params.rb +16 -8
  863. data/test/test_segment.rb +0 -1
  864. data/test/test_token.rb +81 -0
  865. data/test/test_vad.rb +1 -1
  866. data/test/test_vad_context.rb +100 -0
  867. data/test/test_vad_segment.rb +19 -0
  868. data/test/test_vad_segments.rb +16 -0
  869. data/test/test_whisper.rb +27 -0
  870. data/whispercpp.gemspec +1 -1
  871. metadata +502 -37
  872. data/ext/sources/build-xcframework.sh +0 -571
  873. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -32
  874. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  875. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  876. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  877. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  878. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +0 -44
  879. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +0 -41
  880. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  881. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  882. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +0 -44
  883. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +0 -41
  884. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +0 -48
  885. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  886. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  887. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  888. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  889. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  890. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  891. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,975 @@
1
+ #include "ggml-decoder.h"
2
+
3
+ #include "ggml-backend-impl.h"
4
+ #include "ggml-backend.h"
5
+ #include "ggml-openvino-extra.h"
6
+ #include "ggml-openvino.h"
7
+ #include "ggml-quants.h"
8
+
9
+ #include <ggml-impl.h>
10
+ #include <ggml.h>
11
+
12
+ #include <algorithm>
13
+ #include <cassert>
14
+ #include <cstddef>
15
+ #include <cstdint>
16
+ #include <cstdlib>
17
+ #include <execution>
18
+ #include <fstream>
19
+ #include <iomanip>
20
+ #include <map>
21
+ #include <memory>
22
+ #include <mutex>
23
+ #include <openvino/core/dimension.hpp>
24
+ #include <openvino/core/except.hpp>
25
+ #include <openvino/core/node.hpp>
26
+ #include <openvino/core/partial_shape.hpp>
27
+ #include <openvino/core/type/bfloat16.hpp>
28
+ #include <openvino/core/type/element_type.hpp>
29
+ #include <openvino/core/type/float16.hpp>
30
+ #include <openvino/op/constant.hpp>
31
+ #include <openvino/op/convert.hpp>
32
+ #include <openvino/op/parameter.hpp>
33
+ #include <openvino/runtime/tensor.hpp>
34
+ #include <optional>
35
+ #include <ostream>
36
+ #include <set>
37
+ #include <stdexcept>
38
+ #include <string>
39
+ #include <unordered_map>
40
+ #include <vector>
41
+
42
+ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
43
+ ModelParams & model_params,
44
+ ComputeParams & compute_params,
45
+ std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
46
+ bool is_static,
47
+ bool is_stateful,
48
+ bool is_prefill,
49
+ int prefill_chunk_size) :
50
+ m_is_static(is_static),
51
+ m_is_stateful(is_stateful),
52
+ m_is_prefill(is_prefill),
53
+ m_naive(false),
54
+ m_prefill_chunk_size(prefill_chunk_size),
55
+ m_cgraph(cgraph),
56
+ m_model_weights(model_weights),
57
+ m_model_params(model_params),
58
+ m_compute_params(compute_params) {
59
+ if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
60
+ #ifdef _WIN32
61
+ _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", "");
62
+ #else
63
+ unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
64
+ #endif
65
+ print_tensor_address_map(cgraph);
66
+ }
67
+
68
+ validate_cgraph();
69
+
70
+ set_input_output();
71
+ compute_model_inputs();
72
+ compute_model_outputs();
73
+
74
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
75
+ m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
76
+ m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
77
+ }
78
+
79
+ add_extra_inputs();
80
+ }
81
+
82
+ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
83
+ m_cgraph = cgraph;
84
+ m_model_inputs.clear();
85
+ m_model_outputs.clear();
86
+ m_node_info_list.clear();
87
+ set_input_output();
88
+ compute_model_inputs();
89
+ compute_model_outputs();
90
+ }
91
+
92
+ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
93
+ m_cgraph = cgraph;
94
+ m_model_weights = model_weights;
95
+ m_naive = true;
96
+ set_input_output();
97
+ compute_model_inputs();
98
+ compute_model_outputs();
99
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
100
+ m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
101
+ m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
102
+ }
103
+ }
104
+
105
+ void GgmlOvDecoder::set_input_output() {
106
+ for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
107
+ auto node = m_cgraph->nodes[node_n];
108
+
109
+ NodeInfo current_node_info;
110
+ auto node_name = std::string(node->name);
111
+ auto node_output_name = node_name;
112
+ auto * node_output = node;
113
+ if (node->op == GGML_OP_SET_ROWS) {
114
+ // SET_ROWS updates the tensor in place. For later ov op that uses the
115
+ // the view_src of SET_ROWS, we need to make sure they get the updated tensor
116
+ // by putting the view_src name in the tensor_map in
117
+ // <openvino>/src/frontends/ggml/src/translate_session.cpp
118
+ node_output_name = std::string(node->view_src->name);
119
+ node_output = node->view_src;
120
+ }
121
+
122
+ current_node_info.node = node;
123
+ current_node_info.node_name = node_name;
124
+ current_node_info.node_output = node_output;
125
+ current_node_info.node_output_name = node_output_name;
126
+ current_node_info.node_op_case = 0;
127
+ current_node_info.data_addr = node->data;
128
+
129
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
130
+ auto * src = node->src[i];
131
+ if (src == nullptr) {
132
+ continue;
133
+ }
134
+ auto src_name = std::string(src->name);
135
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
136
+ src_name = get_graph_input_ov_name(src, node);
137
+ }
138
+ current_node_info.node_inputs[src_name] = src;
139
+ current_node_info.node_inputs_names.push_back(src_name);
140
+ }
141
+
142
+ m_node_info_list.push_back(current_node_info);
143
+ }
144
+ }
145
+
146
+ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
147
+ int op_case = 0;
148
+ switch (node->op) {
149
+ case GGML_OP_RESHAPE: {
150
+ auto * src = node->src[0];
151
+ if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
152
+ op_case = 4;
153
+ } else if (node->ne[0] * node->ne[1] == src->ne[0]) {
154
+ op_case = 1;
155
+ } else if (src->ne[0] * src->ne[1] == node->ne[0]) {
156
+ op_case = 2;
157
+ if (src->ne[2] * src->ne[3] == node->ne[1]) {
158
+ op_case = 5;
159
+ }
160
+ } else if (src->ne[0] * src->ne[1] == node->ne[1]) {
161
+ op_case = 3;
162
+ } else if (src->ne[1] * src->ne[2] == node->ne[1]) {
163
+ op_case = 6;
164
+ }
165
+ break;
166
+ }
167
+ case GGML_OP_CONT: {
168
+ if (node->src[0]->op == GGML_OP_PERMUTE) {
169
+ op_case = 1;
170
+ } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
171
+ op_case = 2;
172
+ } else if (node->src[0]->op == GGML_OP_VIEW) {
173
+ op_case = 3;
174
+ }
175
+ break;
176
+ }
177
+ case GGML_OP_PERMUTE: {
178
+ if (node->src[0]->op != GGML_OP_VIEW) {
179
+ op_case = 1;
180
+ } else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
181
+ // kv cache tensor
182
+ std::string src_name(node->view_src->name);
183
+ int layer = extract_layer_from_name(src_name);
184
+ if (!is_swa_layer(layer)) {
185
+ op_case = 2;
186
+ } else {
187
+ op_case = 3;
188
+ }
189
+ } else {
190
+ // rope'ed query tensor
191
+ op_case = 4;
192
+ }
193
+ break;
194
+ }
195
+ case GGML_OP_MUL_MAT: {
196
+ if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
197
+ op_case = 2;
198
+ } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
199
+ op_case = 3;
200
+ }
201
+ break;
202
+ }
203
+ case GGML_OP_GET_ROWS: {
204
+ if (node->src[1]->op == GGML_OP_VIEW) {
205
+ op_case = 2;
206
+ }
207
+ break;
208
+ }
209
+ case GGML_OP_ROPE: {
210
+ if (node->src[0]->op == GGML_OP_VIEW) {
211
+ op_case = 2;
212
+ }
213
+ break;
214
+ }
215
+ case GGML_OP_VIEW: {
216
+ if (node->src[0]->op == GGML_OP_VIEW) {
217
+ auto * src = node->src[0];
218
+ if (ggml_nelements(node) != ggml_nelements(src)) {
219
+ throw std::runtime_error("Unsupported VIEW case");
220
+ }
221
+ op_case = 2;
222
+ }
223
+ {
224
+ auto * src = node->src[0];
225
+ if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) {
226
+ // Compare each dimension of node and src, if only one dimension differs then op_case=3
227
+ int diff_count = 0;
228
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
229
+ if (node->ne[i] != src->ne[i]) {
230
+ diff_count++;
231
+ }
232
+ }
233
+ if (diff_count == 1) {
234
+ op_case = 3;
235
+ }
236
+ }
237
+ }
238
+ break;
239
+ }
240
+ default:
241
+ break;
242
+ }
243
+ return op_case;
244
+ }
245
+
246
// Parses the layer index out of a tensor name of the form "<prefix>_l<N>[ <suffix>]".
//
// The number is taken from just after the first "_l" up to the next space
// (or the end of the string).
//
// @param name  tensor name to parse
// @return      the parsed layer index
// @throws std::invalid_argument  if the name has no "_l" marker, or if no number
//                                follows it (the latter is raised by std::stoi)
// @throws std::out_of_range      if the number does not fit in an int (from std::stoi)
int extract_layer_from_name(const std::string & name) {
    static const std::string marker = "_l";

    const size_t marker_pos = name.find(marker);
    if (marker_pos == std::string::npos) {
        // A runtime check rather than assert(): with NDEBUG the assert vanishes and
        // npos + 2 would wrap around, silently parsing an unrelated substring.
        throw std::invalid_argument("Tensor name has no layer marker \"_l\": " + name);
    }

    const size_t digits_begin = marker_pos + marker.size();
    size_t       digits_end   = name.find(' ', digits_begin);
    if (digits_end == std::string::npos) {
        digits_end = name.length();
    }

    return std::stoi(name.substr(digits_begin, digits_end - digits_begin));
}
258
+
259
+ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
260
+ ModelParams model_params;
261
+ ComputeParams compute_params;
262
+ for (int i = 0; i < cgraph->n_nodes; i++) {
263
+ auto * node = cgraph->nodes[i];
264
+ std::string name = std::string(node->name);
265
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
266
+ model_params.n_heads = node->src[0]->ne[2];
267
+ model_params.n_heads_kv = node->src[1]->ne[2];
268
+ model_params.head_size = node->src[0]->ne[0];
269
+ compute_params.input_len = node->src[0]->ne[1];
270
+
271
+ auto * cache_k_perm = node->src[1];
272
+ if (cache_k_perm->op == GGML_OP_CPY) {
273
+ cache_k_perm = cache_k_perm->src[0];
274
+ }
275
+ assert(cache_k_perm->op == GGML_OP_PERMUTE);
276
+ auto * cache_k_view = cache_k_perm->src[0];
277
+ assert(cache_k_view->op == GGML_OP_VIEW);
278
+
279
+ auto * cache_k = cache_k_view->src[0];
280
+ int layer = extract_layer_from_name(cache_k->name);
281
+ auto * mask = node->src[3];
282
+ std::string mask_name(mask->name);
283
+
284
+ model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
285
+ if (mask_name.find("swa") != std::string::npos) {
286
+ model_params.swa_layers.push_back(layer);
287
+ model_params.ctx_per_seq_swa = cache_k->ne[1];
288
+ } else {
289
+ model_params.ctx_per_seq = cache_k->ne[1];
290
+ model_params.n_seq = cache_k->ne[2];
291
+ }
292
+
293
+ compute_params.n_seq_active = mask->ne[3];
294
+ auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
295
+ size_t offset;
296
+ memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
297
+ compute_params.seq_active_start = offset / seq_size;
298
+ compute_params.token_len_per_seq = node->ne[2];
299
+
300
+ if (mask_name.find("swa") != std::string::npos) {
301
+ compute_params.attention_size_swa = mask->ne[0];
302
+ } else {
303
+ compute_params.attention_size = mask->ne[0];
304
+ }
305
+ if (is_static) {
306
+ compute_params.attention_size = model_params.ctx_per_seq;
307
+ compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
308
+ compute_params.token_len_per_seq = 1;
309
+ }
310
+ break;
311
+ }
312
+ if (node->op == GGML_OP_ROPE) {
313
+ memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
314
+ }
315
+ }
316
+ auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
317
+ compute_params.output_len = output_tensor->ne[1];
318
+ // for NPU, output_len is always 1 except for llama-perplexity
319
+ if (is_static && compute_params.output_len == 0) {
320
+ compute_params.output_len = 1;
321
+ }
322
+ model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
323
+ model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq;
324
+ return {model_params, compute_params};
325
+ }
326
+
327
+ void GgmlOvDecoder::validate_cgraph() const {
328
+ if (m_model_params.n_seq > 1 && m_is_static == true) {
329
+ throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
330
+ }
331
+ }
332
+
333
+ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
334
+ if (m_naive) {
335
+ return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
336
+ }
337
+ auto name = std::string(input->name);
338
+ ov::PartialShape input_shape;
339
+
340
+ if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
341
+ // tokens or positions
342
+ int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
343
+ input_shape = ov::PartialShape{1, 1, 1, len};
344
+
345
+ } else if (is_output_idx(input, op)) {
346
+ // output index
347
+ input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
348
+
349
+ } else if (is_inp_mask(input, op)) {
350
+ // mask
351
+ if (m_is_static) {
352
+ input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
353
+ } else if (m_is_stateful) {
354
+ input_shape = ov::PartialShape{1, 1, -1, -1};
355
+ } else {
356
+ input_shape = ov::PartialShape{-1, 1, -1, -1};
357
+ }
358
+
359
+ } else if (is_kvcache(input, op)) {
360
+ // kvcache
361
+ input_shape = ov::PartialShape{get_shape(input)};
362
+ if (!m_is_static) {
363
+ // do not fix ctx size to make llama-bench work across test params
364
+ input_shape[2] = -1;
365
+ }
366
+ if (is_stateful()) {
367
+ // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
368
+ // to stateful layout [1, seq, n_heads_kv, head_size].
369
+ assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
370
+ input_shape[2].is_dynamic() &&
371
+ input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
372
+ input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
373
+ m_model_params.head_size};
374
+ }
375
+
376
+ } else if (is_kv_idx(input, op)) {
377
+ // kv update index
378
+ int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
379
+ input_shape = ov::PartialShape{1, 1, 1, len};
380
+
381
+ } else {
382
+ input_shape = ov::PartialShape{get_shape(input)};
383
+ }
384
+ return input_shape;
385
+ }
386
+
387
+ void GgmlOvDecoder::add_extra_inputs() {
388
+ // Extra inputs:
389
+ // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
390
+ // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
391
+ // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
392
+
393
+ auto create_1d_input = [this](const std::string & name, int64_t value) {
394
+ if (m_is_static) {
395
+ auto constant =
396
+ std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
397
+ constant->set_friendly_name(name);
398
+ m_model_extra_inputs[name] = constant;
399
+ } else {
400
+ auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
401
+ param_node->set_friendly_name(name);
402
+ param_node->output(0).get_tensor().set_names({name});
403
+ m_model_extra_inputs[name] = param_node;
404
+
405
+ auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
406
+ *tensor->data<int64_t>() = value;
407
+ m_model_extra_input_values[name] = tensor;
408
+ }
409
+ };
410
+
411
+ create_1d_input("attention_size", m_compute_params.attention_size);
412
+ if (m_compute_params.attention_size_swa != -1) {
413
+ create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
414
+ }
415
+ create_1d_input("n_seq_active", m_compute_params.n_seq_active);
416
+ create_1d_input("seq_active_start", m_compute_params.seq_active_start);
417
+ create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
418
+ create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
419
+ // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active);
420
+ }
421
+
422
+ bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
423
+ ggml_tensor * node = m_cgraph->nodes[node_idx];
424
+ for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
425
+ ggml_tensor * other_node = m_cgraph->nodes[i];
426
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
427
+ if (other_node->src[j] == node) {
428
+ return true;
429
+ }
430
+ }
431
+ }
432
+ return false;
433
+ }
434
+
435
+ void GgmlOvDecoder::compute_model_inputs() {
436
+ m_model_inputs.clear();
437
+ m_inputs.clear();
438
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
439
+ ggml_tensor * node = m_cgraph->nodes[i];
440
+ // the node op is NONE means this node maybe as input of later nodes, we should add it to model inputs for this node.
441
+ if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
442
+ std::string node_name(node->name);
443
+ if (m_model_weights.find(node_name) == m_model_weights.end()) {
444
+ m_inputs[node_name] = node;
445
+ auto param_node =
446
+ std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
447
+ param_node->set_friendly_name(node_name);
448
+ param_node->output(0).get_tensor().set_names({node_name});
449
+ m_model_inputs[node_name] = param_node;
450
+ }
451
+ continue;
452
+ }
453
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
454
+ auto * src = node->src[i];
455
+ if (src == nullptr) {
456
+ continue;
457
+ }
458
+ std::string src_name = std::string(src->name);
459
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
460
+ src_name = get_graph_input_ov_name(src, node);
461
+ }
462
+ if (m_model_weights.find(src_name) != m_model_weights.end()) {
463
+ continue;
464
+ }
465
+
466
+ bool is_intermediate_node = false;
467
+ for (const auto & node_info : m_node_info_list) {
468
+ if (node_info.node == src) {
469
+ is_intermediate_node = true;
470
+ break;
471
+ }
472
+ }
473
+ if (is_intermediate_node) {
474
+ continue;
475
+ }
476
+ if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
477
+ continue;
478
+ }
479
+
480
+ m_inputs[src_name] = src;
481
+
482
+ ggml_backend_buffer * buffer = src->buffer;
483
+ // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
484
+ if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
485
+ if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
486
+ it == m_model_params.kv_names.end()) {
487
+ m_model_params.kv_names.push_back(src_name);
488
+ }
489
+ }
490
+ ov::PartialShape param_shape = get_graph_input_shape(node, src);
491
+ auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
492
+ param_node->set_friendly_name(src_name);
493
+ param_node->output(0).get_tensor().set_names({src_name});
494
+ m_model_inputs[src_name] = param_node;
495
+ }
496
+ }
497
+ }
498
+
499
+ void GgmlOvDecoder::compute_model_outputs() {
500
+ m_model_outputs.clear();
501
+ m_model_output_names.clear();
502
+ for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
503
+ auto * cur_node = m_cgraph->nodes[node_n];
504
+ // if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs.
505
+ if (cur_node->op == GGML_OP_NONE) {
506
+ continue;
507
+ }
508
+ auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
509
+ if (cur_node_use_count == 0) {
510
+ // The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
511
+ if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
512
+ cur_node = cur_node->view_src;
513
+ }
514
+ } else {
515
+ int input_use_count = 0;
516
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
517
+ ggml_tensor * node = m_cgraph->nodes[i];
518
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
519
+ if (node->src[j] != NULL && node->src[j] == cur_node) {
520
+ input_use_count++;
521
+ }
522
+ }
523
+ }
524
+ if (input_use_count == cur_node_use_count) {
525
+ cur_node = nullptr;
526
+ }
527
+ }
528
+ if (cur_node != nullptr) {
529
+ std::string node_output_name(cur_node->name);
530
+ m_model_outputs[node_output_name] = cur_node;
531
+ m_model_output_names.push_back(node_output_name);
532
+ }
533
+ }
534
+ }
535
+
536
+ const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
537
+ if (tensor == nullptr) {
538
+ return nullptr;
539
+ }
540
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
541
+ const auto * node = m_cgraph->nodes[i];
542
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
543
+ if (node->src[j] == tensor) {
544
+ return node;
545
+ }
546
+ }
547
+ }
548
+ return nullptr;
549
+ }
550
+
551
+ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
552
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
553
+ const auto * node = m_cgraph->nodes[i];
554
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
555
+ const auto * src = node->src[j];
556
+ if (src == nullptr) {
557
+ break;
558
+ }
559
+ if (std::string(src->name) == name) {
560
+ return src;
561
+ }
562
+ }
563
+ }
564
+ return nullptr;
565
+ }
566
+
567
+ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
568
+ std::map<std::string, std::string> kv_param_res_names;
569
+ for (const auto & name : m_model_params.kv_names) {
570
+ kv_param_res_names[name] = name;
571
+ }
572
+ return kv_param_res_names;
573
+ }
574
+
575
+ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
576
+ static std::mutex weights_mutex;
577
+ std::lock_guard<std::mutex> lock(weights_mutex);
578
+
579
+ std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
580
+ auto * nodes = cgraph->nodes;
581
+ auto n_nodes = cgraph->n_nodes;
582
+ for (int node_i = 0; node_i < n_nodes; node_i++) {
583
+ auto * node = nodes[node_i];
584
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
585
+ auto * src = node->src[i];
586
+ if (src == nullptr) {
587
+ continue;
588
+ }
589
+
590
+ std::string src_name(src->name);
591
+ if (is_rope_freqs_weight(src, node)) {
592
+ src_name = "rope_freqs.weight";
593
+ }
594
+ if (!src->view_src) {
595
+ ggml_backend_buffer * buffer = src->buffer;
596
+ if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
597
+ if (model_weights.find(src_name) == model_weights.end()) {
598
+ auto weight_node = create_weight_node(src, naive);
599
+ weight_node->set_friendly_name(src_name);
600
+ model_weights[src_name] = weight_node;
601
+ }
602
+ }
603
+ }
604
+ }
605
+ }
606
+ return model_weights;
607
+ }
608
+
609
+ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
610
+ const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
611
+
612
+ // Check if we have a pre-built constant from the OpenVINO backend buffer
613
+ // This is set during ggml_backend_openvino_buffer_set_tensor
614
+ if (tensor->extra) {
615
+ OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
616
+ " Possibly this is a cpu backend repacked quantized weights");
617
+ // Cast to our extra base type and check the type
618
+ auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
619
+
620
+ if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
621
+ // F16/F32/BF16 weight with shared-memory constant
622
+ auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
623
+ if (weight_extra->weight_node) {
624
+ // GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
625
+ return weight_extra->weight_node;
626
+ }
627
+ } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
628
+ // Quantized weight with pre-extracted data
629
+ auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
630
+ if (quant_extra->weight_node) {
631
+ // GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
632
+ return quant_extra->weight_node;
633
+ }
634
+ }
635
+ }
636
+
637
+ // There are three cases where we need to create a new weight node:
638
+ // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
639
+ // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
640
+ // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node
641
+
642
+ // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
643
+ static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
644
+ GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
645
+ GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
646
+ if (weight_types.find(tensor->type) == weight_types.end()) {
647
+ throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
648
+ ggml_type_name(tensor->type));
649
+ }
650
+
651
+ OvWeight ov_weight;
652
+ if (ggml_is_quantized(tensor->type)) {
653
+ auto use_bias = naive;
654
+ if (is_ov_buffer) {
655
+ // For quantized weights, copy raw data to a temp buffer first because
656
+ // process_weight_tensor reads from data and writes extracted results
657
+ // (weights/scales/zp) to output_base_ptr — they would overlap if both
658
+ // point to tensor->data.
659
+ size_t raw_size = ggml_nbytes(tensor);
660
+ std::vector<uint8_t> tmp(raw_size);
661
+ memcpy(tmp.data(), tensor->data, raw_size);
662
+ ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
663
+ } else {
664
+ ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
665
+ }
666
+ } else {
667
+ // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
668
+ // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
669
+ ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
670
+ }
671
+
672
+ ov_weight.weight_node->set_friendly_name(tensor->name);
673
+ if (!is_ov_buffer) {
674
+ return ov_weight.weight_node;
675
+ }
676
+
677
+ ggml_openvino_extra_base * extra;
678
+ if (ov_weight.is_quantized()) {
679
+ extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
680
+ std::move(ov_weight.zp), ov_weight.weight_node);
681
+ } else {
682
+ extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
683
+ }
684
+ ggml_openvino_buffer_register_extra(tensor, extra);
685
+
686
+ return ov_weight.weight_node;
687
+ }
688
+
689
+ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
690
+ std::ofstream file(filename);
691
+ if (!file.is_open()) {
692
+ std::cerr << "Failed to open file" << std::endl;
693
+ return;
694
+ }
695
+
696
+ file << "=== GRAPH ===\n";
697
+
698
+ // clang-format off
699
+ file << "n_nodes = " << cgraph->n_nodes << "\n";
700
+ file << " " << std::setw(3) << "nodes"
701
+ << std::setw(15) << "shape"
702
+ << std::setw(20) << "op"
703
+ << std::setw(20) << "name"
704
+ << std::setw(3) << " "
705
+ << std::setw(62) << "stride"
706
+ << std::setw(20) << "buffer_type"
707
+ << "\n";
708
+ for (int i = 0; i < cgraph->n_nodes; i++) {
709
+ ggml_tensor * node = cgraph->nodes[i];
710
+
711
+ // Get buffer type name
712
+ const char * buf_name = "none";
713
+ ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
714
+ if (buf) {
715
+ buf_name = ggml_backend_buffer_name(buf);
716
+ }
717
+
718
+ file << " - " << std::setw(3) << i << ": [ "
719
+ << std::setw(5) << node->ne[0] << ", "
720
+ << std::setw(5) << node->ne[1] << ", "
721
+ << std::setw(5) << node->ne[2] << ", "
722
+ << std::setw(5) << node->ne[3] << "] "
723
+ << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
724
+ << std::left << std::setw(45) << node->name << std::right
725
+ << std::setw(2) << "[ "
726
+ << std::setw(0) << node->nb[0] << ", "
727
+ << std::setw(5) << node->nb[1] << ", "
728
+ << std::setw(5) << node->nb[2] << ", "
729
+ << std::setw(5) << node->nb[3] << "] "
730
+ << std::right << std::setw(15) << buf_name << std::right
731
+ << "\n";
732
+
733
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
734
+ if (auto* src = node->src[i]) {
735
+ // Get buffer type name for source
736
+ const char * src_buf_name = "none";
737
+ ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
738
+ if (src_buf) {
739
+ src_buf_name = ggml_backend_buffer_name(src_buf);
740
+ }
741
+
742
+ file << std::setw(10) << " [ "
743
+ << std::setw(5) << src->ne[0] << ", "
744
+ << std::setw(5) << src->ne[1] << ", "
745
+ << std::setw(5) << src->ne[2] << ", "
746
+ << std::setw(5) << src->ne[3] << "] "
747
+ << std::setw(12)
748
+ << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
749
+ file << std::left << std::setw(30) << src->name << std::right
750
+ << std::setw(16) << "[ "
751
+ << std::setw(0) << src->nb[0] << ", "
752
+ << std::setw(5) << src->nb[1] << ", "
753
+ << std::setw(5) << src->nb[2] << ", "
754
+ << std::setw(5) << src->nb[3] << "] "
755
+ << std::right << std::setw(15) << src_buf_name << std::right
756
+ << "\n";
757
+ }
758
+ }
759
+ }
760
+
761
+ file << "n_leafs = " << cgraph->n_leafs << "\n";
762
+ for (int i = 0; i < cgraph->n_leafs; i++) {
763
+ ggml_tensor * node = cgraph->leafs[i];
764
+
765
+ // Get buffer type name for leaf
766
+ const char * leaf_buf_name = "none";
767
+ ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
768
+ if (leaf_buf) {
769
+ leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
770
+ }
771
+
772
+ file << " - " << std::setw(3) << i << ": [ "
773
+ << std::setw(5) << node->ne[0] << ", "
774
+ << std::setw(5) << node->ne[1] << "] "
775
+ << std::setw(8) << ggml_op_name(node->op) << " "
776
+ << std::setw(16) << ggml_get_name(node)
777
+ << std::setw(20) << leaf_buf_name << "\n";
778
+ }
779
+ // clang-format on
780
+ file << "========================================\n";
781
+
782
+ file.close();
783
+ }
784
+
785
+ void print_tensor_address_map(const ggml_cgraph * cgraph) {
786
+ std::map<void *, std::vector<std::string>> address_map;
787
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
788
+ auto * node = cgraph->nodes[node_n];
789
+ if (node->data) {
790
+ auto it = address_map.find(node->data);
791
+ if (it == address_map.end()) {
792
+ address_map[node->data] = std::vector<std::string>();
793
+ }
794
+ address_map[node->data].push_back(node->name);
795
+ }
796
+ }
797
+ for (const auto & pair : address_map) {
798
+ std::cout << "Address: " << pair.first << std::endl;
799
+ for (const auto & name : pair.second) {
800
+ std::cout << name << " ; ";
801
+ }
802
+ std::cout << std::endl << std::endl;
803
+ }
804
+ }
805
+
806
+ ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
807
+ std::vector<size_t> shape;
808
+ for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
809
+ shape.push_back(static_cast<size_t>(tensor->ne[i]));
810
+ }
811
+ return shape;
812
+ }
813
+
814
+ std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
815
+ std::vector<size_t> stride;
816
+ for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
817
+ stride.push_back(static_cast<size_t>(tensor->nb[i]));
818
+ }
819
+ return stride;
820
+ }
821
+
822
+ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
823
+ switch (tensor->type) {
824
+ case GGML_TYPE_F64:
825
+ return ov::element::f64;
826
+ case GGML_TYPE_F32:
827
+ return ov::element::f32;
828
+ case GGML_TYPE_F16:
829
+ return ov::element::f16;
830
+ case GGML_TYPE_BF16:
831
+ return ov::element::bf16;
832
+ case GGML_TYPE_I8:
833
+ return ov::element::i8;
834
+ case GGML_TYPE_I16:
835
+ return ov::element::i16;
836
+ case GGML_TYPE_I32:
837
+ return ov::element::i32;
838
+ case GGML_TYPE_I64:
839
+ return ov::element::i64;
840
+ default:
841
+ return ov::element::dynamic;
842
+ }
843
+ }
844
+
845
+ ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
846
+ return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name)));
847
+ }
848
+
849
+ std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
850
+ return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
851
+ }
852
+
853
+ ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
854
+ return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name));
855
+ }
856
+
857
+ size_t GgmlOvDecoder::get_input_size() const {
858
+ return m_model_inputs.size();
859
+ }
860
+
861
+ size_t GgmlOvDecoder::get_input_size(int node_idx) const {
862
+ return m_node_info_list[node_idx].node_inputs_names.size();
863
+ }
864
+
865
+ std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
866
+ return m_node_info_list[node_idx].node_inputs_names;
867
+ }
868
+
869
+ ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
870
+ auto * ggml_tensor = m_node_info_list[node_idx].node_output;
871
+ return ov::PartialShape(get_shape(ggml_tensor));
872
+ }
873
+
874
+ ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
875
+ return get_ov_type(m_node_info_list[node_idx].node);
876
+ }
877
+
878
+ std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
879
+ return {m_node_info_list[node_idx].node_output_name};
880
+ }
881
+
882
+ const std::string & GgmlOvDecoder::get_op_name() const {
883
+ static const std::string unknown_name = "UNKNOWN_OP_NAME";
884
+ return unknown_name;
885
+ }
886
+
887
+ const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
888
+ return m_node_info_list[node_idx].node_name;
889
+ }
890
+
891
+ int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
892
+ return m_node_info_list[node_idx].node_inputs.at(name)->op_params;
893
+ }
894
+
895
+ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
896
+ return m_node_info_list[node_idx].node->op_params;
897
+ }
898
+
899
+ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
900
+ for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
901
+ if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
902
+ continue;
903
+ }
904
+ node_visitor(std::make_shared<GgmlOvDecoder>(*this), node_idx);
905
+ }
906
+ }
907
+
908
+ std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
909
+ static const std::map<ggml_op, std::string> ops = {
910
+ {GGML_OP_NONE, "GGML_OP_NONE" },
911
+ {GGML_OP_ACC, "GGML_OP_ACC" },
912
+ {GGML_OP_ADD, "GGML_OP_ADD" },
913
+ {GGML_OP_ADD1, "GGML_OP_ADD1" },
914
+ {GGML_OP_CONT, "GGML_OP_CONT" },
915
+ {GGML_OP_DIV, "GGML_OP_DIV" },
916
+ {GGML_OP_DUP, "GGML_OP_DUP" },
917
+ {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
918
+ {GGML_OP_MUL, "GGML_OP_MUL" },
919
+ {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
920
+ {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
921
+ {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
922
+ {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
923
+ {GGML_OP_ROPE, "GGML_OP_ROPE" },
924
+ {GGML_OP_SCALE, "GGML_OP_SCALE" },
925
+ {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
926
+ {GGML_OP_SUB, "GGML_OP_SUB" },
927
+ {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
928
+ {GGML_OP_VIEW, "GGML_OP_VIEW" },
929
+ {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
930
+ {GGML_OP_CPY, "GGML_OP_CPY" },
931
+ {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
932
+ };
933
+ static const std::map<ggml_unary_op, std::string> unary_ops = {
934
+ {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
935
+ {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
936
+ {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
937
+ {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
938
+ {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
939
+ {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
940
+ {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
941
+ {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
942
+ {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
943
+ {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
944
+ {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
945
+ {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
946
+ {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
947
+ {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
948
+ {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
949
+ };
950
+ static const std::map<ggml_glu_op, std::string> glu_ops = {
951
+ {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
952
+ {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
953
+ {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
954
+ };
955
+
956
+ switch (node->op) {
957
+ case GGML_OP_UNARY:
958
+ return unary_ops.at(ggml_get_unary_op(node));
959
+ case GGML_OP_GLU:
960
+ return glu_ops.at(ggml_get_glu_op(node));
961
+ default:
962
+ return ops.at(node->op);
963
+ }
964
+ static const std::string unknown_op = "UNKNOWN_GGML_OP";
965
+ return unknown_op;
966
+ }
967
+
968
+ const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
969
+ return m_node_info_list[node_idx].node_op_type;
970
+ }
971
+
972
+ const std::string & GgmlOvDecoder::get_op_type() const {
973
+ static const std::string unknown_op = "UNKNOWN_GGML_OP";
974
+ return unknown_op;
975
+ }