whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -0,0 +1,975 @@
1
+ #include "ggml-decoder.h"
2
+
3
+ #include "ggml-backend-impl.h"
4
+ #include "ggml-backend.h"
5
+ #include "ggml-openvino-extra.h"
6
+ #include "ggml-openvino.h"
7
+ #include "ggml-quants.h"
8
+
9
+ #include <ggml-impl.h>
10
+ #include <ggml.h>
11
+
12
+ #include <algorithm>
13
+ #include <cassert>
14
+ #include <cstddef>
15
+ #include <cstdint>
16
+ #include <cstdlib>
17
+ #include <execution>
18
+ #include <fstream>
19
+ #include <iomanip>
20
+ #include <map>
21
+ #include <memory>
22
+ #include <mutex>
23
+ #include <openvino/core/dimension.hpp>
24
+ #include <openvino/core/except.hpp>
25
+ #include <openvino/core/node.hpp>
26
+ #include <openvino/core/partial_shape.hpp>
27
+ #include <openvino/core/type/bfloat16.hpp>
28
+ #include <openvino/core/type/element_type.hpp>
29
+ #include <openvino/core/type/float16.hpp>
30
+ #include <openvino/op/constant.hpp>
31
+ #include <openvino/op/convert.hpp>
32
+ #include <openvino/op/parameter.hpp>
33
+ #include <openvino/runtime/tensor.hpp>
34
+ #include <optional>
35
+ #include <ostream>
36
+ #include <set>
37
+ #include <stdexcept>
38
+ #include <string>
39
+ #include <unordered_map>
40
+ #include <vector>
41
+
42
// Build a decoder over a ggml compute graph for translation to an OpenVINO model.
// Captures the static/stateful/prefill configuration, resolves per-node I/O,
// classifies every node (op case + op type) and registers the extra scalar
// inputs (attention sizes, active-sequence bounds) the translated model needs.
//
// @param cgraph             ggml compute graph to decode (borrowed, not owned)
// @param model_params       LLM model-level parameters (heads, context sizes, ...)
// @param compute_params     per-invocation parameters (token lengths, active seqs, ...)
// @param model_weights      pre-built OpenVINO constant nodes, keyed by tensor name
// @param is_static          true for fully-static shapes (NPU path)
// @param is_stateful        true when KV cache is kept as OpenVINO model state
// @param is_prefill         true for the prefill (prompt-processing) graph
// @param prefill_chunk_size fixed token count per prefill chunk (static graphs)
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             ModelParams & model_params,
                             ComputeParams & compute_params,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
    m_compute_params(compute_params) {
    // One-shot debug dump: the env var is cleared after the first use so the
    // tensor address map is printed for a single graph only.
    if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
#ifdef _WIN32
        _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", "");
#else
        unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
#endif
        print_tensor_address_map(cgraph);
    }

    validate_cgraph();

    // Order matters: set_input_output() populates m_node_info_list, which the
    // classification loop below and compute_model_inputs() rely on.
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();

    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
        m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
    }

    add_extra_inputs();
}
81
+
82
+ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
83
+ m_cgraph = cgraph;
84
+ m_model_inputs.clear();
85
+ m_model_outputs.clear();
86
+ m_node_info_list.clear();
87
+ set_input_output();
88
+ compute_model_inputs();
89
+ compute_model_outputs();
90
+ }
91
+
92
+ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
93
+ m_cgraph = cgraph;
94
+ m_model_weights = model_weights;
95
+ m_naive = true;
96
+ set_input_output();
97
+ compute_model_inputs();
98
+ compute_model_outputs();
99
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
100
+ m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
101
+ m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
102
+ }
103
+ }
104
+
105
// Populate m_node_info_list with one NodeInfo per graph node: its name, its
// effective output tensor/name, and its named source tensors. Graph-input
// sources are renamed via get_graph_input_ov_name() so they match the
// OpenVINO parameter names created later.
void GgmlOvDecoder::set_input_output() {
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto node = m_cgraph->nodes[node_n];

        NodeInfo current_node_info;
        auto node_name = std::string(node->name);
        auto node_output_name = node_name;
        auto * node_output = node;
        if (node->op == GGML_OP_SET_ROWS) {
            // SET_ROWS updates the tensor in place. For later ov op that uses the
            // the view_src of SET_ROWS, we need to make sure they get the updated tensor
            // by putting the view_src name in the tensor_map in
            // <openvino>/src/frontends/ggml/src/translate_session.cpp
            node_output_name = std::string(node->view_src->name);
            node_output = node->view_src;
        }

        current_node_info.node = node;
        current_node_info.node_name = node_name;
        current_node_info.node_output = node_output;
        current_node_info.node_output_name = node_output_name;
        // op_case is classified later by compute_op_case(); 0 is the default.
        current_node_info.node_op_case = 0;
        current_node_info.data_addr = node->data;

        // Record each non-null source under the name later ops will look up.
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            auto src_name = std::string(src->name);
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                // Graph inputs get their canonical OpenVINO parameter name.
                src_name = get_graph_input_ov_name(src, node);
            }
            current_node_info.node_inputs[src_name] = src;
            current_node_info.node_inputs_names.push_back(src_name);
        }

        m_node_info_list.push_back(current_node_info);
    }
}
145
+
146
// Classify a node into an "op case" — a small integer that tells the per-op
// translation code which lowering variant to use. The meaning of each case
// number is defined by the corresponding op translator (not visible here);
// 0 means "default/unclassified". Classification is purely structural: it
// looks at the node's op, its sources' ops, and shape relationships.
int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
    int op_case = 0;
    switch (node->op) {
        case GGML_OP_RESHAPE: {
            // Distinguish reshape patterns by how dims merge/split between
            // the source and the result.
            auto * src = node->src[0];
            if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
                // Reshape that undoes a previous reshape.
                op_case = 4;
            } else if (node->ne[0] * node->ne[1] == src->ne[0]) {
                // Split dim0 into (dim0, dim1).
                op_case = 1;
            } else if (src->ne[0] * src->ne[1] == node->ne[0]) {
                // Merge (dim0, dim1) into dim0.
                op_case = 2;
                if (src->ne[2] * src->ne[3] == node->ne[1]) {
                    // ...and also merge (dim2, dim3) into dim1.
                    op_case = 5;
                }
            } else if (src->ne[0] * src->ne[1] == node->ne[1]) {
                op_case = 3;
            } else if (src->ne[1] * src->ne[2] == node->ne[1]) {
                op_case = 6;
            }
            break;
        }
        case GGML_OP_CONT: {
            // CONT lowering depends on what produced the non-contiguous view.
            if (node->src[0]->op == GGML_OP_PERMUTE) {
                op_case = 1;
            } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
                op_case = 2;
            } else if (node->src[0]->op == GGML_OP_VIEW) {
                op_case = 3;
            }
            break;
        }
        case GGML_OP_PERMUTE: {
            if (node->src[0]->op != GGML_OP_VIEW) {
                op_case = 1;
            } else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
                // kv cache tensor
                std::string src_name(node->view_src->name);
                int layer = extract_layer_from_name(src_name);
                // Sliding-window-attention layers get their own case.
                if (!is_swa_layer(layer)) {
                    op_case = 2;
                } else {
                    op_case = 3;
                }
            } else {
                // rope'ed query tensor
                op_case = 4;
            }
            break;
        }
        case GGML_OP_MUL_MAT: {
            if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
                op_case = 2;
            } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
                op_case = 3;
            }
            break;
        }
        case GGML_OP_GET_ROWS: {
            if (node->src[1]->op == GGML_OP_VIEW) {
                op_case = 2;
            }
            break;
        }
        case GGML_OP_ROPE: {
            if (node->src[0]->op == GGML_OP_VIEW) {
                op_case = 2;
            }
            break;
        }
        case GGML_OP_VIEW: {
            if (node->src[0]->op == GGML_OP_VIEW) {
                auto * src = node->src[0];
                // A view of a view is only supported when it does not change
                // the element count.
                if (ggml_nelements(node) != ggml_nelements(src)) {
                    throw std::runtime_error("Unsupported VIEW case");
                }
                op_case = 2;
            }
            {
                auto * src = node->src[0];
                // Naive mode additionally supports a slicing view: the element
                // count changes but exactly one dimension differs.
                if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) {
                    // Compare each dimension of node and src, if only one dimension differs then op_case=3
                    int diff_count = 0;
                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
                        if (node->ne[i] != src->ne[i]) {
                            diff_count++;
                        }
                    }
                    if (diff_count == 1) {
                        op_case = 3;
                    }
                }
            }
            break;
        }
        default:
            break;
    }
    return op_case;
}
245
+
246
// Parse the layer index out of a tensor name of the form "..._l<N>[ suffix]",
// e.g. "cache_k_l12 (view)" -> 12. The digits run from just after the first
// "_l" to the next space (or the end of the string).
//
// @param name  tensor name carrying an "_l<N>" layer marker
// @return      the parsed layer index
// @throws std::runtime_error     when the name has no "_l" marker
// @throws std::invalid_argument  (from std::stoi) when "_l" is not followed by a number
//
// Note: the previous implementation only assert()ed on a missing marker, which
// compiles out in release builds and then wrapped `npos + 2` around to index 1,
// silently parsing garbage or throwing from std::stoi. Fail loudly instead.
int extract_layer_from_name(const std::string & name) {
    size_t pos1 = name.find("_l");
    if (pos1 == std::string::npos) {
        throw std::runtime_error("extract_layer_from_name: no \"_l\" marker in tensor name: " + name);
    }
    pos1 += 2;
    size_t pos2 = name.find(' ', pos1);
    if (pos2 == std::string::npos) {
        pos2 = name.length();
    }
    std::string layer_str = name.substr(pos1, pos2 - pos1);
    int layer = std::stoi(layer_str);
    return layer;
}
258
+
259
// Derive LLM model/compute parameters by pattern-matching the compute graph.
// The first FLASH_ATTN_EXT node is walked back through (optional CPY) ->
// PERMUTE -> VIEW to reach the K-cache tensor, from which head counts,
// context sizes and the active-sequence window are read; the scan then stops
// (note the `break`), so ROPE params are only captured from ROPE nodes that
// appear before the first FLASH_ATTN_EXT in graph order.
//
// @param cgraph     graph to analyze
// @param is_static  true for the fully-static (NPU) pipeline, which forces
//                   fixed attention sizes and token_len_per_seq = 1
// @return {model_params, compute_params} extracted from the graph
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
    ModelParams model_params;
    ComputeParams compute_params;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        auto * node = cgraph->nodes[i];
        // NOTE(review): `name` is unused — candidate for removal.
        std::string name = std::string(node->name);
        if (node->op == GGML_OP_FLASH_ATTN_EXT) {
            // src[0] = Q, src[1] = K, src[3] = attention mask.
            model_params.n_heads = node->src[0]->ne[2];
            model_params.n_heads_kv = node->src[1]->ne[2];
            model_params.head_size = node->src[0]->ne[0];
            compute_params.input_len = node->src[0]->ne[1];

            // Walk back to the K-cache: [CPY ->] PERMUTE -> VIEW -> cache_k.
            auto * cache_k_perm = node->src[1];
            if (cache_k_perm->op == GGML_OP_CPY) {
                cache_k_perm = cache_k_perm->src[0];
            }
            assert(cache_k_perm->op == GGML_OP_PERMUTE);
            auto * cache_k_view = cache_k_perm->src[0];
            assert(cache_k_view->op == GGML_OP_VIEW);

            auto * cache_k = cache_k_view->src[0];
            int layer = extract_layer_from_name(cache_k->name);
            auto * mask = node->src[3];
            std::string mask_name(mask->name);

            model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
            // A "swa" mask marks a sliding-window-attention layer; those track
            // a separate (smaller) context size.
            if (mask_name.find("swa") != std::string::npos) {
                model_params.swa_layers.push_back(layer);
                model_params.ctx_per_seq_swa = cache_k->ne[1];
            } else {
                model_params.ctx_per_seq = cache_k->ne[1];
                model_params.n_seq = cache_k->ne[2];
            }

            compute_params.n_seq_active = mask->ne[3];
            // The VIEW's byte offset into the cache, divided by the per-sequence
            // stride, yields the index of the first active sequence.
            auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
            size_t offset;
            memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
            compute_params.seq_active_start = offset / seq_size;
            compute_params.token_len_per_seq = node->ne[2];

            if (mask_name.find("swa") != std::string::npos) {
                compute_params.attention_size_swa = mask->ne[0];
            } else {
                compute_params.attention_size = mask->ne[0];
            }
            if (is_static) {
                // Static graphs always attend over the full context window.
                compute_params.attention_size = model_params.ctx_per_seq;
                compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
                compute_params.token_len_per_seq = 1;
            }
            break;
        }
        if (node->op == GGML_OP_ROPE) {
            // Capture the full RoPE parameter block (15 int32 values).
            memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
        }
    }
    // The last node in the graph is the model output (logits).
    auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
    compute_params.output_len = output_tensor->ne[1];
    // for NPU, output_len is always 1 except for llama-perplexity
    if (is_static && compute_params.output_len == 0) {
        compute_params.output_len = 1;
    }
    model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
    model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq;
    return {model_params, compute_params};
}
326
+
327
+ void GgmlOvDecoder::validate_cgraph() const {
328
+ if (m_model_params.n_seq > 1 && m_is_static == true) {
329
+ throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
330
+ }
331
+ }
332
+
333
+ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
334
+ if (m_naive) {
335
+ return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
336
+ }
337
+ auto name = std::string(input->name);
338
+ ov::PartialShape input_shape;
339
+
340
+ if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
341
+ // tokens or positions
342
+ int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
343
+ input_shape = ov::PartialShape{1, 1, 1, len};
344
+
345
+ } else if (is_output_idx(input, op)) {
346
+ // output index
347
+ input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
348
+
349
+ } else if (is_inp_mask(input, op)) {
350
+ // mask
351
+ if (m_is_static) {
352
+ input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
353
+ } else if (m_is_stateful) {
354
+ input_shape = ov::PartialShape{1, 1, -1, -1};
355
+ } else {
356
+ input_shape = ov::PartialShape{-1, 1, -1, -1};
357
+ }
358
+
359
+ } else if (is_kvcache(input, op)) {
360
+ // kvcache
361
+ input_shape = ov::PartialShape{get_shape(input)};
362
+ if (!m_is_static) {
363
+ // do not fix ctx size to make llama-bench work across test params
364
+ input_shape[2] = -1;
365
+ }
366
+ if (is_stateful()) {
367
+ // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
368
+ // to stateful layout [1, seq, n_heads_kv, head_size].
369
+ assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
370
+ input_shape[2].is_dynamic() &&
371
+ input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
372
+ input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
373
+ m_model_params.head_size};
374
+ }
375
+
376
+ } else if (is_kv_idx(input, op)) {
377
+ // kv update index
378
+ int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
379
+ input_shape = ov::PartialShape{1, 1, 1, len};
380
+
381
+ } else {
382
+ input_shape = ov::PartialShape{get_shape(input)};
383
+ }
384
+ return input_shape;
385
+ }
386
+
387
void GgmlOvDecoder::add_extra_inputs() {
    // Extra inputs:
    // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
    // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch

    // Register one scalar (shape {1}, i64) input: a baked-in Constant for the
    // static (NPU) pipeline, or a Parameter plus a backing value tensor for
    // the dynamic pipeline so the value can change per invocation.
    auto create_1d_input = [this](const std::string & name, int64_t value) {
        if (m_is_static) {
            auto constant =
                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
            constant->set_friendly_name(name);
            m_model_extra_inputs[name] = constant;
        } else {
            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
            param_node->set_friendly_name(name);
            param_node->output(0).get_tensor().set_names({name});
            m_model_extra_inputs[name] = param_node;

            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
            *tensor->data<int64_t>() = value;
            m_model_extra_input_values[name] = tensor;
        }
    };

    create_1d_input("attention_size", m_compute_params.attention_size);
    // The SWA variant only exists for models with sliding-window layers
    // (-1 means "not set" in ComputeParams).
    if (m_compute_params.attention_size_swa != -1) {
        create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
    }
    create_1d_input("n_seq_active", m_compute_params.n_seq_active);
    create_1d_input("seq_active_start", m_compute_params.seq_active_start);
    create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
    create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
    // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active);
}
421
+
422
+ bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
423
+ ggml_tensor * node = m_cgraph->nodes[node_idx];
424
+ for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
425
+ ggml_tensor * other_node = m_cgraph->nodes[i];
426
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
427
+ if (other_node->src[j] == node) {
428
+ return true;
429
+ }
430
+ }
431
+ }
432
+ return false;
433
+ }
434
+
435
+ void GgmlOvDecoder::compute_model_inputs() {
436
+ m_model_inputs.clear();
437
+ m_inputs.clear();
438
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
439
+ ggml_tensor * node = m_cgraph->nodes[i];
440
+ // the node op is NONE means this node maybe as input of later nodes, we should add it to model inputs for this node.
441
+ if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
442
+ std::string node_name(node->name);
443
+ if (m_model_weights.find(node_name) == m_model_weights.end()) {
444
+ m_inputs[node_name] = node;
445
+ auto param_node =
446
+ std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
447
+ param_node->set_friendly_name(node_name);
448
+ param_node->output(0).get_tensor().set_names({node_name});
449
+ m_model_inputs[node_name] = param_node;
450
+ }
451
+ continue;
452
+ }
453
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
454
+ auto * src = node->src[i];
455
+ if (src == nullptr) {
456
+ continue;
457
+ }
458
+ std::string src_name = std::string(src->name);
459
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
460
+ src_name = get_graph_input_ov_name(src, node);
461
+ }
462
+ if (m_model_weights.find(src_name) != m_model_weights.end()) {
463
+ continue;
464
+ }
465
+
466
+ bool is_intermediate_node = false;
467
+ for (const auto & node_info : m_node_info_list) {
468
+ if (node_info.node == src) {
469
+ is_intermediate_node = true;
470
+ break;
471
+ }
472
+ }
473
+ if (is_intermediate_node) {
474
+ continue;
475
+ }
476
+ if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
477
+ continue;
478
+ }
479
+
480
+ m_inputs[src_name] = src;
481
+
482
+ ggml_backend_buffer * buffer = src->buffer;
483
+ // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
484
+ if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
485
+ if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
486
+ it == m_model_params.kv_names.end()) {
487
+ m_model_params.kv_names.push_back(src_name);
488
+ }
489
+ }
490
+ ov::PartialShape param_shape = get_graph_input_shape(node, src);
491
+ auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
492
+ param_node->set_friendly_name(src_name);
493
+ param_node->output(0).get_tensor().set_names({src_name});
494
+ m_model_inputs[src_name] = param_node;
495
+ }
496
+ }
497
+ }
498
+
499
// Determine which node results must be exposed as model outputs. A node is an
// output when its value is needed outside the graph: either nothing consumes
// it at all (use_count == 0, typically the final logits node), or its
// graph-level use count exceeds the number of in-graph consumers (meaning some
// uses come from outside this cgraph).
void GgmlOvDecoder::compute_model_outputs() {
    m_model_outputs.clear();
    m_model_output_names.clear();
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto * cur_node = m_cgraph->nodes[node_n];
        // if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs.
        if (cur_node->op == GGML_OP_NONE) {
            continue;
        }
        auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
        if (cur_node_use_count == 0) {
            // The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
            if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
                cur_node = cur_node->view_src;
            }
        } else {
            // Count how many times this node is consumed inside the graph; if
            // that accounts for all recorded uses, it is purely intermediate.
            int input_use_count = 0;
            for (int i = 0; i < m_cgraph->n_nodes; i++) {
                ggml_tensor * node = m_cgraph->nodes[i];
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    if (node->src[j] != NULL && node->src[j] == cur_node) {
                        input_use_count++;
                    }
                }
            }
            if (input_use_count == cur_node_use_count) {
                cur_node = nullptr;  // intermediate only — not an output
            }
        }
        if (cur_node != nullptr) {
            std::string node_output_name(cur_node->name);
            m_model_outputs[node_output_name] = cur_node;
            m_model_output_names.push_back(node_output_name);
        }
    }
}
535
+
536
+ const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
537
+ if (tensor == nullptr) {
538
+ return nullptr;
539
+ }
540
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
541
+ const auto * node = m_cgraph->nodes[i];
542
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
543
+ if (node->src[j] == tensor) {
544
+ return node;
545
+ }
546
+ }
547
+ }
548
+ return nullptr;
549
+ }
550
+
551
+ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
552
+ for (int i = 0; i < m_cgraph->n_nodes; i++) {
553
+ const auto * node = m_cgraph->nodes[i];
554
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
555
+ const auto * src = node->src[j];
556
+ if (src == nullptr) {
557
+ break;
558
+ }
559
+ if (std::string(src->name) == name) {
560
+ return src;
561
+ }
562
+ }
563
+ }
564
+ return nullptr;
565
+ }
566
+
567
+ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
568
+ std::map<std::string, std::string> kv_param_res_names;
569
+ for (const auto & name : m_model_params.kv_names) {
570
+ kv_param_res_names[name] = name;
571
+ }
572
+ return kv_param_res_names;
573
+ }
574
+
575
+ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
576
+ static std::mutex weights_mutex;
577
+ std::lock_guard<std::mutex> lock(weights_mutex);
578
+
579
+ std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
580
+ auto * nodes = cgraph->nodes;
581
+ auto n_nodes = cgraph->n_nodes;
582
+ for (int node_i = 0; node_i < n_nodes; node_i++) {
583
+ auto * node = nodes[node_i];
584
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
585
+ auto * src = node->src[i];
586
+ if (src == nullptr) {
587
+ continue;
588
+ }
589
+
590
+ std::string src_name(src->name);
591
+ if (is_rope_freqs_weight(src, node)) {
592
+ src_name = "rope_freqs.weight";
593
+ }
594
+ if (!src->view_src) {
595
+ ggml_backend_buffer * buffer = src->buffer;
596
+ if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
597
+ if (model_weights.find(src_name) == model_weights.end()) {
598
+ auto weight_node = create_weight_node(src, naive);
599
+ weight_node->set_friendly_name(src_name);
600
+ model_weights[src_name] = weight_node;
601
+ }
602
+ }
603
+ }
604
+ }
605
+ }
606
+ return model_weights;
607
+ }
608
+
609
// Build (or reuse) the ov::Node that holds the weight data of `tensor`.
// Pre-built constants attached via tensor->extra are returned directly;
// otherwise the weight is extracted/converted here and, for OpenVINO buffers,
// cached back onto the tensor as an extra for later reuse.
// `naive` selects the bias-based (naive) extraction mode for quantized weights.
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
    const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);

    // Check if we have a pre-built constant from the OpenVINO backend buffer
    // This is set during ggml_backend_openvino_buffer_set_tensor
    if (tensor->extra) {
        OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
                                          " Possibly this is a cpu backend repacked quantized weights");
        // Cast to our extra base type and check the type
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);

        if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
            // F16/F32/BF16 weight with shared-memory constant
            auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
            if (weight_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
                return weight_extra->weight_node;
            }
        } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
            // Quantized weight with pre-extracted data
            auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
            if (quant_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
                return quant_extra->weight_node;
            }
        }
    }

    // There are three cases where we need to create a new weight node:
    // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
    // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
    // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node

    // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
    // Only these ggml element types are accepted as weights.
    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
    if (weight_types.find(tensor->type) == weight_types.end()) {
        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                 ggml_type_name(tensor->type));
    }

    OvWeight ov_weight;
    if (ggml_is_quantized(tensor->type)) {
        auto use_bias = naive;
        if (is_ov_buffer) {
            // For quantized weights, copy raw data to a temp buffer first because
            // process_weight_tensor reads from data and writes extracted results
            // (weights/scales/zp) to output_base_ptr — they would overlap if both
            // point to tensor->data.
            size_t raw_size = ggml_nbytes(tensor);
            std::vector<uint8_t> tmp(raw_size);
            memcpy(tmp.data(), tensor->data, raw_size);
            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
        } else {
            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
        }
    } else {
        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
    }

    ov_weight.weight_node->set_friendly_name(tensor->name);
    if (!is_ov_buffer) {
        // Non-OpenVINO buffers get no cached extra; return the fresh node as-is.
        return ov_weight.weight_node;
    }

    // Cache the result on the tensor so subsequent calls hit the fast path above.
    ggml_openvino_extra_base * extra;
    if (ov_weight.is_quantized()) {
        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
                                                         std::move(ov_weight.zp), ov_weight.weight_node);
    } else {
        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
    }
    ggml_openvino_buffer_register_extra(tensor, extra);

    return ov_weight.weight_node;
}
688
+
689
+ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
690
+ std::ofstream file(filename);
691
+ if (!file.is_open()) {
692
+ std::cerr << "Failed to open file" << std::endl;
693
+ return;
694
+ }
695
+
696
+ file << "=== GRAPH ===\n";
697
+
698
+ // clang-format off
699
+ file << "n_nodes = " << cgraph->n_nodes << "\n";
700
+ file << " " << std::setw(3) << "nodes"
701
+ << std::setw(15) << "shape"
702
+ << std::setw(20) << "op"
703
+ << std::setw(20) << "name"
704
+ << std::setw(3) << " "
705
+ << std::setw(62) << "stride"
706
+ << std::setw(20) << "buffer_type"
707
+ << "\n";
708
+ for (int i = 0; i < cgraph->n_nodes; i++) {
709
+ ggml_tensor * node = cgraph->nodes[i];
710
+
711
+ // Get buffer type name
712
+ const char * buf_name = "none";
713
+ ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
714
+ if (buf) {
715
+ buf_name = ggml_backend_buffer_name(buf);
716
+ }
717
+
718
+ file << " - " << std::setw(3) << i << ": [ "
719
+ << std::setw(5) << node->ne[0] << ", "
720
+ << std::setw(5) << node->ne[1] << ", "
721
+ << std::setw(5) << node->ne[2] << ", "
722
+ << std::setw(5) << node->ne[3] << "] "
723
+ << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
724
+ << std::left << std::setw(45) << node->name << std::right
725
+ << std::setw(2) << "[ "
726
+ << std::setw(0) << node->nb[0] << ", "
727
+ << std::setw(5) << node->nb[1] << ", "
728
+ << std::setw(5) << node->nb[2] << ", "
729
+ << std::setw(5) << node->nb[3] << "] "
730
+ << std::right << std::setw(15) << buf_name << std::right
731
+ << "\n";
732
+
733
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
734
+ if (auto* src = node->src[i]) {
735
+ // Get buffer type name for source
736
+ const char * src_buf_name = "none";
737
+ ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
738
+ if (src_buf) {
739
+ src_buf_name = ggml_backend_buffer_name(src_buf);
740
+ }
741
+
742
+ file << std::setw(10) << " [ "
743
+ << std::setw(5) << src->ne[0] << ", "
744
+ << std::setw(5) << src->ne[1] << ", "
745
+ << std::setw(5) << src->ne[2] << ", "
746
+ << std::setw(5) << src->ne[3] << "] "
747
+ << std::setw(12)
748
+ << i << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;
749
+ file << std::left << std::setw(30) << src->name << std::right
750
+ << std::setw(16) << "[ "
751
+ << std::setw(0) << src->nb[0] << ", "
752
+ << std::setw(5) << src->nb[1] << ", "
753
+ << std::setw(5) << src->nb[2] << ", "
754
+ << std::setw(5) << src->nb[3] << "] "
755
+ << std::right << std::setw(15) << src_buf_name << std::right
756
+ << "\n";
757
+ }
758
+ }
759
+ }
760
+
761
+ file << "n_leafs = " << cgraph->n_leafs << "\n";
762
+ for (int i = 0; i < cgraph->n_leafs; i++) {
763
+ ggml_tensor * node = cgraph->leafs[i];
764
+
765
+ // Get buffer type name for leaf
766
+ const char * leaf_buf_name = "none";
767
+ ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
768
+ if (leaf_buf) {
769
+ leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
770
+ }
771
+
772
+ file << " - " << std::setw(3) << i << ": [ "
773
+ << std::setw(5) << node->ne[0] << ", "
774
+ << std::setw(5) << node->ne[1] << "] "
775
+ << std::setw(8) << ggml_op_name(node->op) << " "
776
+ << std::setw(16) << ggml_get_name(node)
777
+ << std::setw(20) << leaf_buf_name << "\n";
778
+ }
779
+ // clang-format on
780
+ file << "========================================\n";
781
+
782
+ file.close();
783
+ }
784
+
785
+ void print_tensor_address_map(const ggml_cgraph * cgraph) {
786
+ std::map<void *, std::vector<std::string>> address_map;
787
+ for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
788
+ auto * node = cgraph->nodes[node_n];
789
+ if (node->data) {
790
+ auto it = address_map.find(node->data);
791
+ if (it == address_map.end()) {
792
+ address_map[node->data] = std::vector<std::string>();
793
+ }
794
+ address_map[node->data].push_back(node->name);
795
+ }
796
+ }
797
+ for (const auto & pair : address_map) {
798
+ std::cout << "Address: " << pair.first << std::endl;
799
+ for (const auto & name : pair.second) {
800
+ std::cout << name << " ; ";
801
+ }
802
+ std::cout << std::endl << std::endl;
803
+ }
804
+ }
805
+
806
+ ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
807
+ std::vector<size_t> shape;
808
+ for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
809
+ shape.push_back(static_cast<size_t>(tensor->ne[i]));
810
+ }
811
+ return shape;
812
+ }
813
+
814
+ std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
815
+ std::vector<size_t> stride;
816
+ for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
817
+ stride.push_back(static_cast<size_t>(tensor->nb[i]));
818
+ }
819
+ return stride;
820
+ }
821
+
822
+ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
823
+ switch (tensor->type) {
824
+ case GGML_TYPE_F64:
825
+ return ov::element::f64;
826
+ case GGML_TYPE_F32:
827
+ return ov::element::f32;
828
+ case GGML_TYPE_F16:
829
+ return ov::element::f16;
830
+ case GGML_TYPE_BF16:
831
+ return ov::element::bf16;
832
+ case GGML_TYPE_I8:
833
+ return ov::element::i8;
834
+ case GGML_TYPE_I16:
835
+ return ov::element::i16;
836
+ case GGML_TYPE_I32:
837
+ return ov::element::i32;
838
+ case GGML_TYPE_I64:
839
+ return ov::element::i64;
840
+ default:
841
+ return ov::element::dynamic;
842
+ }
843
+ }
844
+
845
+ ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
846
+ return ov::PartialShape(get_shape(m_node_info_list[node_idx].node_inputs.at(name)));
847
+ }
848
+
849
+ std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
850
+ return get_stride(m_node_info_list[node_idx].node_inputs.at(name));
851
+ }
852
+
853
+ ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
854
+ return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name));
855
+ }
856
+
857
// Number of distinct model-level inputs (Parameter nodes) for the whole graph.
size_t GgmlOvDecoder::get_input_size() const {
    return m_model_inputs.size();
}
860
+
861
// Number of inputs of the single node at `node_idx`.
size_t GgmlOvDecoder::get_input_size(int node_idx) const {
    return m_node_info_list[node_idx].node_inputs_names.size();
}
864
+
865
// Input tensor names of the node at `node_idx` (returned by value).
std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
    return m_node_info_list[node_idx].node_inputs_names;
}
868
+
869
+ ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
870
+ auto * ggml_tensor = m_node_info_list[node_idx].node_output;
871
+ return ov::PartialShape(get_shape(ggml_tensor));
872
+ }
873
+
874
// Element type of the node at `node_idx`.
// NOTE(review): this reads `.node` while get_output_shape() reads `.node_output`
// — presumably they refer to the same tensor for every op; confirm before relying on it.
ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
    return get_ov_type(m_node_info_list[node_idx].node);
}
877
+
878
// Output names of the node at `node_idx` — always a single-element vector,
// since each node stores exactly one output name here.
std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
    return {m_node_info_list[node_idx].node_output_name};
}
881
+
882
+ const std::string & GgmlOvDecoder::get_op_name() const {
883
+ static const std::string unknown_name = "UNKNOWN_OP_NAME";
884
+ return unknown_name;
885
+ }
886
+
887
// Name of the node at `node_idx` (reference into m_node_info_list).
const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
    return m_node_info_list[node_idx].node_name;
}
890
+
891
// Raw ggml op_params of the named input tensor of node `node_idx`;
// throws std::out_of_range if the name is not an input of that node.
int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
    return m_node_info_list[node_idx].node_inputs.at(name)->op_params;
}
894
+
895
// Raw ggml op_params of the node at `node_idx` itself.
int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
    return m_node_info_list[node_idx].node->op_params;
}
898
+
899
// Invoke `node_visitor` once per effective node of the graph; GGML_OP_NONE
// placeholder nodes are skipped.
// NOTE(review): a fresh shared copy of this decoder is allocated for every
// node (std::make_shared<GgmlOvDecoder>(*this)); presumably the visitor needs
// an independently-owned handle per call — confirm before hoisting a single
// copy out of the loop.
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
    for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
        if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
            continue;
        }
        node_visitor(std::make_shared<GgmlOvDecoder>(*this), node_idx);
    }
}
907
+
908
+ std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
909
+ static const std::map<ggml_op, std::string> ops = {
910
+ {GGML_OP_NONE, "GGML_OP_NONE" },
911
+ {GGML_OP_ACC, "GGML_OP_ACC" },
912
+ {GGML_OP_ADD, "GGML_OP_ADD" },
913
+ {GGML_OP_ADD1, "GGML_OP_ADD1" },
914
+ {GGML_OP_CONT, "GGML_OP_CONT" },
915
+ {GGML_OP_DIV, "GGML_OP_DIV" },
916
+ {GGML_OP_DUP, "GGML_OP_DUP" },
917
+ {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
918
+ {GGML_OP_MUL, "GGML_OP_MUL" },
919
+ {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
920
+ {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
921
+ {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
922
+ {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
923
+ {GGML_OP_ROPE, "GGML_OP_ROPE" },
924
+ {GGML_OP_SCALE, "GGML_OP_SCALE" },
925
+ {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
926
+ {GGML_OP_SUB, "GGML_OP_SUB" },
927
+ {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
928
+ {GGML_OP_VIEW, "GGML_OP_VIEW" },
929
+ {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
930
+ {GGML_OP_CPY, "GGML_OP_CPY" },
931
+ {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
932
+ };
933
+ static const std::map<ggml_unary_op, std::string> unary_ops = {
934
+ {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
935
+ {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
936
+ {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
937
+ {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
938
+ {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
939
+ {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
940
+ {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
941
+ {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
942
+ {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
943
+ {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
944
+ {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
945
+ {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
946
+ {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
947
+ {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
948
+ {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
949
+ };
950
+ static const std::map<ggml_glu_op, std::string> glu_ops = {
951
+ {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
952
+ {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
953
+ {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
954
+ };
955
+
956
+ switch (node->op) {
957
+ case GGML_OP_UNARY:
958
+ return unary_ops.at(ggml_get_unary_op(node));
959
+ case GGML_OP_GLU:
960
+ return glu_ops.at(ggml_get_glu_op(node));
961
+ default:
962
+ return ops.at(node->op);
963
+ }
964
+ static const std::string unknown_op = "UNKNOWN_GGML_OP";
965
+ return unknown_op;
966
+ }
967
+
968
// Cached op-type string of the node at `node_idx` (filled via compute_op_type).
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
    return m_node_info_list[node_idx].node_op_type;
}
971
+
972
+ const std::string & GgmlOvDecoder::get_op_type() const {
973
+ static const std::string unknown_op = "UNKNOWN_GGML_OP";
974
+ return unknown_op;
975
+ }