whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -0,0 +1,884 @@
1
+ #include "ggml-quants.h"
2
+
3
+ #include "ggml-common.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml.h"
6
+
7
+ #include <algorithm>
8
+ #include <cassert>
9
+ #include <cmath>
10
+ #include <cstddef>
11
+ #include <cstdint>
12
+ #include <limits>
13
+ #include <memory>
14
+ #include <openvino/core/except.hpp>
15
+ #include <openvino/core/node.hpp>
16
+ #include <openvino/core/node_output.hpp>
17
+ #include <openvino/core/parallel.hpp>
18
+ #include <openvino/core/shape.hpp>
19
+ #include <openvino/core/type/element_type.hpp>
20
+ #include <openvino/core/type/element_type_traits.hpp>
21
+ #include <openvino/core/type/float16.hpp>
22
+ #include <openvino/op/add.hpp>
23
+ #include <openvino/op/constant.hpp>
24
+ #include <openvino/op/convert.hpp>
25
+ #include <openvino/op/multiply.hpp>
26
+ #include <openvino/op/reshape.hpp>
27
+ #include <openvino/op/subtract.hpp>
28
+ #include <openvino/op/util/attr_types.hpp>
29
+ #include <openvino/runtime/tensor.hpp>
30
+ #include <string>
31
+ #include <vector>
32
+
33
// Re-packs 32 4-bit values (16 packed source bytes) so that the 16 low
// nibbles land in dst[0..7] and the 16 high nibbles in dst[8..15], each
// half re-packed two-per-byte in source order.
void unpack_32_4(const uint8_t * data, uint8_t * dst) {
    std::fill_n(dst, 16, 0);
    for (int pair = 0; pair < 8; ++pair) {
        const uint8_t even_byte = data[2 * pair];
        const uint8_t odd_byte  = data[2 * pair + 1];
        // Low nibbles of two consecutive source bytes form one output byte...
        dst[pair] = (uint8_t) ((even_byte & 0x0F) | ((odd_byte & 0x0F) << 4));
        // ...high nibbles form the matching byte in the upper output half.
        dst[8 + pair] = (uint8_t) ((even_byte >> 4) | ((odd_byte >> 4) << 4));
    }
}
46
+
47
+ // Extracts (weight, scales, zp) from Q4_0 tensors.
48
+ // Data layout is: |16 bit scale|32 x 4bit weights|.
49
+ void extract_q4_0_data(const ggml_tensor * tensor,
50
+ ov::Tensor & weights_arr,
51
+ ov::Tensor & scales_arr,
52
+ ov::Tensor & zp_arr) {
53
+ const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
54
+
55
+ auto * data = static_cast<uint8_t *>(tensor->data);
56
+ auto * weights = static_cast<uint8_t *>(weights_arr.data());
57
+ auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
58
+ auto * zp = static_cast<uint8_t *>(zp_arr.data());
59
+
60
+ bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
61
+
62
+ // For Q4_0, zero point is always 8
63
+ if (is_scalar_zp) {
64
+ zp[0] = 8 | (8 << 4); // Pack two 4-bit values
65
+ }
66
+
67
+ ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
68
+ scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
69
+ // For asymmetric quantization, compute per-block zero points
70
+ if (!is_scalar_zp) {
71
+ // Pack two 4-bit zero points per byte
72
+ if (i % 2 == 0) {
73
+ zp[i / 2] = 8; // Lower nibble
74
+ } else {
75
+ zp[i / 2] |= (8 << 4); // Upper nibble
76
+ }
77
+ }
78
+ unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
79
+ });
80
+ }
81
+
82
+ // Extracts (weight, scales, zp) from Q4_1 tensors.
83
+ // Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|.
84
+ void extract_q4_1_data(const ggml_tensor * tensor,
85
+ ov::Tensor & weights_arr,
86
+ ov::Tensor & scales_arr,
87
+ ov::Tensor & zp_arr,
88
+ bool use_bias) {
89
+ const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
90
+
91
+ auto * data = static_cast<uint8_t *>(tensor->data);
92
+ auto * weights = static_cast<uint8_t *>(weights_arr.data());
93
+ auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
94
+
95
+ if (use_bias) {
96
+ // Store bias (min) directly as f16 instead of computing u4 zero points
97
+ auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
98
+ ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
99
+ float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
100
+ float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
101
+ scales[i] = ov::float16(scale);
102
+ bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias
103
+ unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
104
+ });
105
+ } else {
106
+ auto * zp = static_cast<uint8_t *>(zp_arr.data());
107
+ ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
108
+ float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
109
+ float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
110
+ scales[i] = ov::float16(scale);
111
+ // zp = -min / scale (bias = min, so zp = -bias/scale)
112
+ uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
113
+ // Pack two 4-bit zero points per byte
114
+ if (i % 2 == 0) {
115
+ zp[i / 2] = zp_val & 0x0F; // Lower nibble
116
+ } else {
117
+ zp[i / 2] |= (zp_val << 4); // Upper nibble
118
+ }
119
+ unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
120
+ });
121
+ }
122
+ }
123
+
124
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
void extract_q8_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr) {
    const uint64_t weights_per_block = 32;
    const uint64_t bytes_per_block = 34;  // 2 bytes scale, 32x1 byte weights

    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());

    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

    // For Q8_0, zero point is always 128
    if (is_scalar_zp) {
        zp[0] = 128;
    }

    // Each iteration touches disjoint ranges of scales/zp/weights, so the
    // parallel loop is race-free.
    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
        scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
        // For asymmetric quantization, store per-block zero points
        if (!is_scalar_zp) {
            zp[i] = 128;
        }
        for (size_t j = 0; j < weights_per_block; ++j) {
            uint8_t x = block_data[j + 2];  // j+2 to skip the scale bytes.
            // Source weights are int8_t; flipping the sign bit re-biases them
            // into uint8_t (adds 128: -128 -> 0, 127 -> 255) to match zp = 128.
            x ^= 1 << 7;
            weights[i * weights_per_block + j] = x;
        }
    });
}
160
+
161
// Re-packs 256 4-bit values (128 packed source bytes) chunk by chunk: each
// 32-byte source chunk yields 32 output bytes where the first 16 hold the
// low nibbles and the last 16 hold the high nibbles, two-per-byte.
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
    std::fill_n(dst, 128, 0);
    for (size_t chunk = 0; chunk < 4; ++chunk) {
        const uint8_t * src = data + chunk * 32;
        uint8_t * lo = dst + chunk * 32;       // low-nibble half of this chunk
        uint8_t * hi = lo + 16;                // high-nibble half of this chunk
        for (int pair = 0; pair < 16; ++pair) {
            const uint8_t even_byte = src[2 * pair];
            const uint8_t odd_byte  = src[2 * pair + 1];
            lo[pair] = (uint8_t) ((even_byte & 0x0F) | ((odd_byte & 0x0F) << 4));
            hi[pair] = (uint8_t) ((even_byte >> 4) | ((odd_byte >> 4) << 4));
        }
    }
}
178
+
179
// Extracts (weight, scales, zp/bias) from Q4_K tensors.
// Super-block layout: |f16 d|f16 dmin|12 bytes packed 6-bit sub-scales and
// mins|128 bytes of 4-bit weights| — 256 weights in 8 sub-blocks of 32.
void extract_q4_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
    // NOTE(review): assumes a 2-D weight tensor, so nb[3] equals the total
    // byte size of the tensor — TODO confirm against callers.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();

    // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
    auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;

    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;

        // Super-block-wide scale factors for the sub-scales (d) and mins (dmin).
        float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
        float scale_mins = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));

        // 12 bytes of packed 6-bit sub-block scales/mins.
        uint8_t * qs1 = block_data + 4;

        // Sub-blocks 0-3: plain low-6-bit fields; sub-blocks 4-7: 4 low bits
        // from qs1[8..11] recombined with 2 bits borrowed from qs1[0..3].
        float scale_vals[8];
        scale_vals[0] = scale_scales * static_cast<float>((*(qs1) & 0b111111));
        scale_vals[1] = scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111));
        scale_vals[2] = scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111));
        scale_vals[3] = scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111));
        scale_vals[4] = scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4));
        scale_vals[5] = scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4));
        scale_vals[6] = scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4));
        scale_vals[7] = scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4));

        // Calculate min values (bias = -min); same packing, offset by 4 bytes.
        float min_vals[8];
        min_vals[0] = scale_mins * static_cast<float>((*(qs1 + 4) & 0b111111));
        min_vals[1] = scale_mins * static_cast<float>((*(qs1 + 5) & 0b111111));
        min_vals[2] = scale_mins * static_cast<float>((*(qs1 + 6) & 0b111111));
        min_vals[3] = scale_mins * static_cast<float>((*(qs1 + 7) & 0b111111));
        min_vals[4] = scale_mins * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4));
        min_vals[5] = scale_mins * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4));
        min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
        min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));

        // Store scales and compute zero points or bias
        for (int j = 0; j < 8; j++) {
            scales[i * 8 + j] = ov::float16(scale_vals[j]);
            if (use_bias) {
                // Store bias = -min directly as f16, dequant: w*s + bias
                bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
            } else {
                // zp = min / scale (since bias = -min and zp = -bias/scale)
                uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
                // Pack two 4-bit zero points per byte. Both nibbles of each
                // byte come from this same super-block (8 values per block),
                // so the parallel loop does not race on zp_u4.
                size_t idx = i * 8 + j;
                if (idx % 2 == 0) {
                    zp_u4[idx / 2] = zp_val & 0x0F;
                } else {
                    zp_u4[idx / 2] |= (zp_val << 4);
                }
            }
        }
        unpack_256_4(block_data + 16, weights + i * 128);
    });
}
248
+
249
// Extracts (weight, scales, zp) from Q6_K tensors.
// Super-block layout: |128 bytes ql (low 4 bits)|64 bytes qh (high 2 bits)|
// 16 x int8 sub-block scales|f16 d| — 256 weights in 16 sub-blocks of 16.
void extract_q6_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr) {
    const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
    // NOTE(review): assumes a 2-D weight tensor, so nb[3] equals the total
    // byte size of the tensor — TODO confirm against callers.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());

    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization

    // For Q6_K, zero point is always 32
    if (is_scalar_zp) {
        zp[0] = 32;
    }

    // Each iteration writes disjoint index ranges, so the loop is race-free.
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;

        // Super-block scale d: an f16 stored after ql/qh/sub-scales, hence
        // offset (128+64+16)/2 = 104 in uint16 units.
        float scale_factor =
            static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));  // (128+64+16)/2

        for (size_t j = 0; j < 16; j++) {
            // Effective sub-block scale = d * signed 8-bit sub-scale.
            scales[j + i * 16] =
                ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
            // For asymmetric quantization, store per-block zero points
            if (!is_scalar_zp) {
                zp[j + i * 16] = 32;
            }
        }

        uint8_t * ql = block_data;        // low 4 bits, two weights per byte
        uint8_t * qh = block_data + 128;  // high 2 bits, four weights per byte

        // Recombine each 6-bit weight as (4 low bits) | (2 high bits << 4);
        // the interleaving below mirrors the Q6_K on-disk ordering.
        for (int64_t j = 0; j < 32; ++j) {
            weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
            weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
            weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
            weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
            weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
            weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
            weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
            weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
        }
    });
}
298
+
299
// Decodes the packed 6-bit (scale, min) pair for sub-block j from the
// 12-byte scale section of a K-quant super-block.
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        // First four sub-blocks: plain 6-bit fields in q[j] and q[j+4].
        *d = (uint8_t) (q[j] & 63);
        *m = (uint8_t) (q[j + 4] & 63);
    } else {
        // Last four: low 4 bits live in q[j+4]; the top 2 bits are borrowed
        // from the upper bits of q[j-4] (scale) and q[j] (min).
        const uint8_t packed = q[j + 4];
        *d = (uint8_t) ((packed & 0x0F) | ((q[j - 4] >> 6) << 4));
        *m = (uint8_t) ((packed >> 4) | ((q[j] >> 6) << 4));
    }
}
308
+
309
// Extracts (weight, scales, zp/bias) from Q5_K tensors.
// Super-block layout: |f16 d|f16 dmin|12 bytes packed 6-bit scales+mins|
// 32 bytes qh (5th bits)|128 bytes ql (low nibbles)| — 256 weights.
void extract_q5_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
    // NOTE(review): assumes a 2-D weight tensor, so nb[3] equals the total
    // byte size of the tensor — TODO confirm against callers.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();

    // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
    auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;

    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;

        // Super-block-wide scale (d) and min (dmin) factors.
        const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
        const float min_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));

        const uint8_t * scales_data = block_data + 4;   // 12 bytes of scales
        const uint8_t * qh = block_data + 4 + 12;       // 32 bytes of high bits
        const uint8_t * ql = block_data + 4 + 12 + 32;  // 128 bytes of low bits

        int is = 0;      // sub-block index into the packed scales (advances by 2)
        uint8_t u1 = 1;  // qh bit mask for the first 32 weights of the pair
        uint8_t u2 = 2;  // qh bit mask for the second 32 weights of the pair

        // Process 2 blocks in one iteration
        for (int j = 0; j < 256; j += 64) {  // 256 = QK_K, so 4 iterations of 64
            uint8_t sc;
            uint8_t m;

            // Get scale and min for first 32 elements
            get_scale_min_k4(is + 0, scales_data, &sc, &m);
            const float d1 = d * sc;
            const float m1 = min_factor * m;

            // Get scale and min for second 32 elements
            get_scale_min_k4(is + 1, scales_data, &sc, &m);
            const float d2 = d * sc;
            const float m2 = min_factor * m;

            scales[i * 8 + is] = ov::float16(d1);
            scales[i * 8 + is + 1] = ov::float16(d2);
            if (use_bias) {
                // Store bias = -min directly as f16, dequant: w*s + bias
                bias_f16[i * 8 + is] = ov::float16(-m1);
                bias_f16[i * 8 + is + 1] = ov::float16(-m2);
            } else {
                // zp = min / scale (since bias = -min and zp = -bias/scale)
                zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
                zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
            }

            // Extract weights for first 32 elements (matching deq formula exactly):
            // 5-bit value = low nibble of ql plus 16 if the u1 bit of qh is set.
            for (int l = 0; l < 32; ++l) {
                weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0);
            }

            // Extract weights for second 32 elements
            for (int l = 0; l < 32; ++l) {
                weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 16 : 0);
            }

            // Advance to the next pair: ql moves on, qh is re-read with
            // shifted bit masks, is selects the next two sub-block scales.
            ql += 32;
            is += 2;
            u1 <<= 2;
            u2 <<= 2;
        }
    });
}
383
+
384
+ // TODO Reorder for make_intX_weights
385
+
386
+ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
387
+ ov::Tensor & scales,
388
+ ov::Tensor & zp,
389
+ size_t group_size,
390
+ bool use_bias) {
391
+ ov::Shape orig_shape = weight.get_shape();
392
+
393
+ // Expand dimensions for scales and zp/bias
394
+ auto scale_shape = scales.get_shape();
395
+ auto zp_shape = zp.get_shape();
396
+ bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
397
+
398
+ ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
399
+
400
+ if (packed_shape[1] == 1) {
401
+ // Requantized channel-wise case
402
+ packed_shape.erase(packed_shape.begin() + 1);
403
+ } else {
404
+ scale_shape.push_back(1);
405
+ scales.set_shape(scale_shape);
406
+ // For symmetric quantization, zp remains scalar (don't resize)
407
+ if (!is_scalar_zp) {
408
+ zp_shape.push_back(1);
409
+ zp.set_shape(zp_shape);
410
+ }
411
+ }
412
+
413
+ // Create graph nodes
414
+ auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
415
+ static_cast<uint8_t *>(weight.data()), nullptr);
416
+ weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
417
+ auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
418
+ auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
419
+
420
+ ov::Output<ov::Node> result;
421
+ if (use_bias && !is_scalar_zp) {
422
+ // Bias path: w * s + b (zp tensor holds f16 bias values)
423
+ auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
424
+ auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
425
+ result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
426
+ } else {
427
+ // Zero point path: (w - zp) * s
428
+ auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
429
+ float zp_value;
430
+ if (ov::op::util::get_single_value(zero_point, zp_value)) {
431
+ zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
432
+ }
433
+ auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
434
+ auto w_zp =
435
+ std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
436
+ result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
437
+ }
438
+
439
+ if (packed_shape.size() != 2) {
440
+ // If not requantized channel-wise case, reshape back to original shape
441
+ auto final_shape =
442
+ std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
443
+ result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
444
+ }
445
+
446
+ return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
447
+ }
448
+
449
+ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
450
+ ov::Tensor & scales,
451
+ ov::Tensor & zp,
452
+ size_t group_size,
453
+ bool use_bias) {
454
+ ov::Shape orig_weight_shape = weight.get_shape();
455
+
456
+ // Expand dimensions for scales and zp/bias
457
+ ov::Shape scale_shape = scales.get_shape();
458
+ auto zp_shape = zp.get_shape();
459
+ bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
460
+
461
+ // Create INT4 weight tensor
462
+ ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
463
+
464
+ if (packed_shape[1] == 1) {
465
+ // Requantized channel-wise case
466
+ packed_shape.erase(packed_shape.begin() + 1);
467
+ } else {
468
+ scale_shape.push_back(1);
469
+ scales.set_shape(scale_shape);
470
+ // For symmetric quantization, zp remains scalar (don't resize)
471
+ if (!is_scalar_zp) {
472
+ zp_shape.push_back(1);
473
+ zp.set_shape(zp_shape);
474
+ }
475
+ }
476
+
477
+ auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
478
+ static_cast<uint8_t *>(weight.data()), nullptr);
479
+ weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
480
+ auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
481
+ auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
482
+
483
+ ov::Output<ov::Node> result;
484
+ if (use_bias && !is_scalar_zp) {
485
+ // Bias path: w * s + b (zp tensor holds f16 bias values)
486
+ auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
487
+ auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
488
+ result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
489
+ } else {
490
+ // Zero point path: (w - zp) * s
491
+ auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
492
+ float zp_value;
493
+ if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
494
+ zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
495
+ }
496
+ auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
497
+ auto w_zp =
498
+ std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
499
+ result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
500
+ }
501
+
502
+ if (packed_shape.size() != 2) {
503
+ // If not requantized channel-wise case, reshape back to original shape
504
+ auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
505
+ orig_weight_shape);
506
+ result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
507
+ }
508
+
509
+ return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
510
+ }
511
+
512
+ // Extract quantized weights from tensor and create weight subgraph
513
+ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
514
+ const void * data,
515
+ ov::Tensor & weights,
516
+ ov::Tensor & scales,
517
+ ov::Tensor & zp,
518
+ bool use_bias) {
519
+ // Create a temporary tensor for extraction functions that read from tensor->data
520
+ ggml_tensor temp_tensor = *tensor;
521
+ temp_tensor.data = const_cast<void *>(data);
522
+
523
+ // Determine block size based on tensor type
524
+ int64_t weights_per_block;
525
+ bool is_u4;
526
+ switch (tensor->type) {
527
+ case GGML_TYPE_Q4_0:
528
+ case GGML_TYPE_Q4_1:
529
+ case GGML_TYPE_Q4_K:
530
+ is_u4 = true;
531
+ weights_per_block = 32;
532
+ break;
533
+ case GGML_TYPE_Q8_0:
534
+ case GGML_TYPE_Q5_K:
535
+ is_u4 = false;
536
+ weights_per_block = 32;
537
+ break;
538
+ case GGML_TYPE_Q6_K:
539
+ is_u4 = false;
540
+ weights_per_block = 16;
541
+ break;
542
+ default:
543
+ throw std::runtime_error("Unsupported quantized type for extraction: " +
544
+ std::string(ggml_type_name(tensor->type)));
545
+ }
546
+
547
+ // Extract quantized data
548
+ switch (tensor->type) {
549
+ case GGML_TYPE_Q4_0:
550
+ extract_q4_0_data(&temp_tensor, weights, scales, zp);
551
+ break;
552
+ case GGML_TYPE_Q4_1:
553
+ extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
554
+ break;
555
+ case GGML_TYPE_Q4_K:
556
+ extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
557
+ break;
558
+ case GGML_TYPE_Q8_0:
559
+ extract_q8_0_data(&temp_tensor, weights, scales, zp);
560
+ break;
561
+ case GGML_TYPE_Q6_K:
562
+ extract_q6_k_data(&temp_tensor, weights, scales, zp);
563
+ break;
564
+ case GGML_TYPE_Q5_K:
565
+ extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
566
+ break;
567
+ default:
568
+ throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
569
+ }
570
+
571
+ // Create the OpenVINO weight subgraph
572
+ ov::Output<ov::Node> weight_node;
573
+ if (is_u4) {
574
+ weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
575
+ } else {
576
+ weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
577
+ }
578
+
579
+ auto result = weight_node.get_node_shared_ptr();
580
+ result->set_friendly_name(tensor->name);
581
+ return result;
582
+ }
583
+
584
+ // Requantize weights to target format, writing to provided buffers
585
+ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
586
+ const void * data,
587
+ ExtraQuantType requant_type,
588
+ int64_t block_size,
589
+ ov::Tensor & weights,
590
+ ov::Tensor & scales,
591
+ ov::Tensor & zp) {
592
+ int64_t n_elements = ggml_nelements(tensor);
593
+
594
+ // First dequantize to F32
595
+ std::vector<float> weights_f32(n_elements);
596
+ ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements);
597
+
598
+ // Handle F16 case - just convert and create constant
599
+ if (requant_type == ExtraQuantType::F16) {
600
+ ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements);
601
+ auto result = std::make_shared<ov::op::v0::Constant>(weights);
602
+ result->set_friendly_name(tensor->name);
603
+ return result;
604
+ }
605
+
606
+ // Requantize to target quantized format
607
+ bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
608
+
609
+ if (is_u4) {
610
+ quantize_q4_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
611
+ } else if (requant_type == ExtraQuantType::Q8_1_C) {
612
+ quantize_q8_1(weights_f32.data(), weights, scales, zp, n_elements, block_size);
613
+ } else {
614
+ quantize_q8_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
615
+ }
616
+
617
+ // Create the OpenVINO weight subgraph
618
+ ov::Output<ov::Node> weight_node;
619
+ if (is_u4) {
620
+ weight_node = make_int4_weights(weights, scales, zp, block_size);
621
+ } else {
622
+ weight_node = make_int8_weights(weights, scales, zp, block_size);
623
+ }
624
+
625
+ auto result = weight_node.get_node_shared_ptr();
626
+ result->set_friendly_name(tensor->name);
627
+ return result;
628
+ }
629
+
630
// Convert a GGML weight tensor (float or quantized) into an OvWeight: backing
// ov::Tensor buffers plus an OpenVINO weight node. `data` is the source payload;
// `output_base_ptr`, when non-null, is a backend buffer that receives the
// extracted/copied data so the resulting constants can share its memory.
// `use_bias` selects the bias-style (f16) zp representation for asymmetric quant
// (test-backend-ops only). Throws via OPENVINO_THROW on unsupported types.
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(data != nullptr);

    OvWeight result;

    // Get 2D shape for weights [rows, cols] (ne[1] x ne[0]; GGML stores dims innermost-first)
    ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};

    // Handle F16/F32/BF16 weights: no extraction needed, wrap (or copy) the raw data.
    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
        ov::element::Type element_type;
        switch (tensor->type) {
            case GGML_TYPE_F32:
                element_type = ov::element::f32;
                break;
            case GGML_TYPE_F16:
                element_type = ov::element::f16;
                break;
            case GGML_TYPE_BF16:
                element_type = ov::element::bf16;
                break;
            default:
                // Unreachable given the enclosing if; kept for switch completeness.
                OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
        }

        if (output_base_ptr && output_base_ptr != data) {
            // Using external buffer - copy data and create shared-memory constant
            size_t tensor_bytes = ggml_nbytes(tensor);
            memcpy(output_base_ptr, data, tensor_bytes);
            result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
        } else {
            // Source buffer is (or already equals) the output buffer - wrap it directly.
            result.weights = ov::Tensor(element_type, node_shape, data);
        }
        result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
        return result;
    }

    // Handle quantized weights
    if (!ggml_is_quantized(tensor->type)) {
        OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
    }

    // Layout describes buffer offsets/sizes for weights/scales/zp and whether a
    // requantization pass is required for this type.
    result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
    const auto & layout = result.layout;
    if (layout.total_size == 0) {
        // total_size == 0 is the layout helper's "unsupported type" signal.
        OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
    }

    if (use_bias) {
        OPENVINO_ASSERT(!layout.is_requant,
                        "use_bias is only used for test-backend-ops, which should not have requantization");
        // bias node will be created on the fly and not use backend buffer
        output_base_ptr = nullptr;
    }

    // F16 requant path - no separate scales/zp needed in result
    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
        if (output_base_ptr) {
            // Place the F16 weights at the layout-provided offset in the backend buffer.
            result.weights = ov::Tensor(ov::element::f16, node_shape,
                                        static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
        } else {
            // No backend buffer: let the tensor own its allocation.
            result.weights = ov::Tensor(ov::element::f16, node_shape);
        }
        ov::Tensor dummy_scales, dummy_zp; // Not used for F16
        result.weight_node =
            requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
        return result;
    }

    // Quantized path (normal extraction or quantized requant)
    // Create weight/scale/zp tensors - shared between both paths
    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
    // Symmetric quant uses a scalar (rank-0) zero point; asymmetric has one per block.
    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

    if (output_base_ptr) {
        // Shared-memory tensors: all three views point into the backend buffer at
        // the offsets computed by the layout helper.
        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
    } else {
        result.weights = ov::Tensor(weight_type, node_shape);
        result.scales = ov::Tensor(ov::element::f16, scale_shape);
        if (use_bias && !layout.is_symmetric) {
            // bias only has effect for asymmetric quant: zp tensor holds f16 bias values
            result.zp = ov::Tensor(ov::element::f16, zp_shape);
        } else {
            result.zp = ov::Tensor(weight_type, zp_shape);
        }
    }

    if (layout.is_requant && layout.requant_type.has_value()) {
        // Requantize from the source type into the target quantized format.
        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
                                                   result.weights, result.scales, result.zp);
    } else {
        // Direct extraction of the native GGML quantized layout.
        result.weight_node =
            extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
    }

    return result;
}
732
+
733
+ void quantize_q4_0(const float * x,
734
+ ov::Tensor & weights_arr,
735
+ ov::Tensor & scales_arr,
736
+ ov::Tensor & zp_arr,
737
+ int64_t k,
738
+ int64_t qk) {
739
+ assert(k % qk == 0);
740
+ const int nb = k / qk;
741
+
742
+ auto * weights = static_cast<uint8_t *>(weights_arr.data());
743
+ auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
744
+ auto * zp = static_cast<uint8_t *>(zp_arr.data());
745
+ bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
746
+
747
+ // For Q4_0, zero point is always 8
748
+ if (is_scalar_zp) {
749
+ zp[0] = 8 | (8 << 4); // Pack two 4-bit values
750
+ }
751
+
752
+ for (int i = 0; i < nb; i++) {
753
+ float amax = 0.0f; // absolute max
754
+ float max = 0.0f;
755
+
756
+ for (int j = 0; j < qk; j++) {
757
+ const float v = x[i * qk + j];
758
+ if (amax < fabsf(v)) {
759
+ amax = fabsf(v);
760
+ max = v;
761
+ }
762
+ }
763
+
764
+ const float d = max / -8;
765
+
766
+ if (d == 0) {
767
+ scales[i] = ov::float16(1.0f);
768
+ // zp is already set to 8 for symmetric, or set per-block for asymmetric
769
+ if (!is_scalar_zp) {
770
+ if (i % 2 == 0) {
771
+ zp[i / 2] = 8;
772
+ } else {
773
+ zp[i / 2] |= (8 << 4);
774
+ }
775
+ }
776
+ memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
777
+ continue;
778
+ }
779
+
780
+ const float id = 1.0f / d;
781
+ scales[i] = ov::float16(d);
782
+ // For asymmetric quantization, store per-block zero points
783
+ if (!is_scalar_zp) {
784
+ if (i % 2 == 0) {
785
+ zp[i / 2] = 8;
786
+ } else {
787
+ zp[i / 2] |= (8 << 4);
788
+ }
789
+ }
790
+
791
+ for (int j = 0; j < qk / 2; ++j) {
792
+ const float x0 = x[i * qk + 2 * j] * id;
793
+ const float x1 = x[i * qk + 2 * j + 1] * id;
794
+ const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
795
+ const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
796
+ weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
797
+ }
798
+ }
799
+ }
800
+
801
+ void quantize_q8_0(const float * x,
802
+ ov::Tensor & weights_arr,
803
+ ov::Tensor & scales_arr,
804
+ ov::Tensor & zp_arr,
805
+ int64_t k,
806
+ int64_t qk) {
807
+ assert(k % qk == 0);
808
+ const int nb = k / qk;
809
+
810
+ auto * weights = static_cast<uint8_t *>(weights_arr.data());
811
+ auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
812
+ auto * zp = static_cast<uint8_t *>(zp_arr.data());
813
+ bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
814
+
815
+ // For Q8_0, zero point is always 128
816
+ if (is_scalar_zp) {
817
+ zp[0] = 128;
818
+ }
819
+
820
+ for (int i = 0; i < nb; i++) {
821
+ float amax = 0.0f; // absolute max
822
+
823
+ for (int j = 0; j < qk; j++) {
824
+ const float v = x[i * qk + j];
825
+ if (amax < fabsf(v)) {
826
+ amax = fabsf(v);
827
+ }
828
+ }
829
+
830
+ const float d = amax / 127.0f;
831
+ const float id = d ? 1.0f / d : 0.0f;
832
+ scales[i] = ov::float16(d);
833
+ // For asymmetric quantization, store per-block zero points
834
+ if (!is_scalar_zp) {
835
+ zp[i] = 128;
836
+ }
837
+
838
+ for (int j = 0; j < qk; ++j) {
839
+ const float x0 = x[i * qk + j] * id;
840
+ const int8_t xi0 = roundf(x0);
841
+ weights[i * qk + j] = (uint8_t) (xi0 + 128);
842
+ }
843
+ }
844
+ }
845
+
846
+ void quantize_q8_1(const float * x,
847
+ ov::Tensor & weights_arr,
848
+ ov::Tensor & scales_arr,
849
+ ov::Tensor & zp_arr,
850
+ int64_t k,
851
+ int64_t qk) {
852
+ assert(k % qk == 0);
853
+ const int nb = k / qk;
854
+
855
+ auto * weights = static_cast<uint8_t *>(weights_arr.data());
856
+ auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
857
+ auto * zp = static_cast<uint8_t *>(zp_arr.data());
858
+ for (int i = 0; i < nb; i++) {
859
+ float min = std::numeric_limits<float>::max();
860
+ float max = std::numeric_limits<float>::lowest();
861
+
862
+ for (int j = 0; j < qk; j++) {
863
+ const float v = x[i * qk + j];
864
+ if (v < min) {
865
+ min = v;
866
+ }
867
+ if (v > max) {
868
+ max = v;
869
+ }
870
+ }
871
+
872
+ const float d = (max - min) / ((1 << 8) - 1);
873
+ const float id = d ? 1.0f / d : 0.0f;
874
+ scales[i] = ov::float16(d);
875
+ // zp = -min / scale (Q8_1 is asymmetric)
876
+ zp[i] = (d != 0.0f) ? (uint8_t) std::round(-min / d) : 0;
877
+
878
+ for (int j = 0; j < qk; ++j) {
879
+ const float x0 = (x[i * qk + j] - min) * id;
880
+ const uint8_t xi0 = roundf(x0);
881
+ weights[i * qk + j] = xi0;
882
+ }
883
+ }
884
+ }