whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -48,6 +48,90 @@ static inline int nearest_int(float fval) {
48
48
 
49
49
  extern "C" {
50
50
 
51
+ #if defined __riscv_zvfh
52
+ void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
53
+ assert(QK8_0 == 32);
54
+ assert(k % QK8_0 == 0);
55
+ const int nb = k / QK8_0;
56
+
57
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
58
+
59
+ // scalar
60
+ const int blck_size_interleave = 1;
61
+ float srcv[4][QK8_0];
62
+ float id[4];
63
+
64
+ for (int i = 0; i < nb; i++) {
65
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
66
+ float amax = 0.0f; // absolute max
67
+
68
+ for (int j = 0; j < QK8_0; j++) {
69
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
70
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
71
+ }
72
+
73
+ const float d = amax / ((1 << 7) - 1);
74
+ id[row_iter] = d ? 1.0f / d : 0.0f;
75
+
76
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
77
+ }
78
+
79
+ for (int j = 0; j < QK8_0 * 4; j++) {
80
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
81
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
82
+ src_offset += (j % blck_size_interleave);
83
+
84
+ float x0 = srcv[src_id][src_offset] * id[src_id];
85
+ y[i].qs[j] = roundf(x0);
86
+ }
87
+ }
88
+ }
89
+
90
+ void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
91
+ assert(QK_K == 256);
92
+ assert(k % QK_K == 0);
93
+ const int nb = k / QK_K;
94
+
95
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
96
+
97
+ const int blck_size_interleave = 1;
98
+ float srcv[4][QK_K];
99
+ float iscale[4];
100
+
101
+ for (int i = 0; i < nb; i++) {
102
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
103
+ float amax = 0.0f; // absolute max
104
+ float max = 0;
105
+
106
+ for (int j = 0; j < QK_K; j++) {
107
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
108
+ // Update the maximum value of the corresponding super block
109
+ if(amax < fabsf(srcv[row_iter][j])) {
110
+ amax = fabsf(srcv[row_iter][j]);
111
+ max = srcv[row_iter][j];
112
+ }
113
+ }
114
+
115
+ iscale[row_iter] = amax ? -127.f/max : 0;
116
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
117
+ }
118
+
119
+ for (int j = 0; j < QK_K / 4; j++) {
120
+ y[i].bsums[j] = 0;
121
+ }
122
+ for (int j = 0; j < QK_K * 4; j++) {
123
+ int src_id = j % 4;
124
+ int src_offset = j / 4;
125
+ int index = ((j >> 6) << 2) + (j & 3);
126
+
127
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
128
+ y[i].qs[j] = nearest_int(x0);
129
+ y[i].bsums[index] += y[i].qs[j];
130
+ }
131
+ }
132
+ }
133
+ #endif
134
+
51
135
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
52
136
  assert(QK8_0 == 32);
53
137
  assert(k % QK8_0 == 0);
@@ -124,7 +208,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
124
208
  }
125
209
  }
126
210
 
127
-
128
211
  void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
129
212
  assert(QK_K == 256);
130
213
  assert(k % QK_K == 0);
@@ -256,192 +339,289 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
256
339
  ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
257
340
  }
258
341
 
259
- extern "C" {
342
+ #if defined __riscv_zvfh
343
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
344
+ assert(nrow == 4);
345
+ UNUSED(nrow);
346
+ ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
347
+ }
260
348
 
261
- void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
262
- const int qk = QK8_0;
263
- const int nb = n / qk;
264
- const int ncols_interleaved = 4;
265
- const int blocklen = 4;
349
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
350
+ assert(nrow == 4);
351
+ UNUSED(nrow);
352
+ ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
353
+ }
354
+ #endif
355
+
356
+ template <int M, int N>
357
+ static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
358
+ float * GGML_RESTRICT s,
359
+ size_t bs,
360
+ const void * GGML_RESTRICT vx,
361
+ const void * GGML_RESTRICT vy,
362
+ int nr,
363
+ int nc) {
364
+ constexpr int blocklen = M;
365
+ constexpr int ncols_interleaved = N;
366
+ const int qk = QK_K;
367
+ const int nb = n / qk;
368
+ const int blocks_per_half = 64 / blocklen;
266
369
 
267
- assert(nr == 1);
268
370
  assert(n % qk == 0);
269
371
  assert(nc % ncols_interleaved == 0);
270
372
 
271
- UNUSED(s);
272
373
  UNUSED(bs);
273
- UNUSED(vx);
274
- UNUSED(vy);
275
374
  UNUSED(nr);
276
- UNUSED(nc);
277
- UNUSED(nb);
278
- UNUSED(ncols_interleaved);
279
- UNUSED(blocklen);
280
375
 
281
- float sumf[4];
282
- int sumi;
376
+ float sumf[8];
283
377
 
284
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
378
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
285
379
  for (int x = 0; x < nc / ncols_interleaved; x++) {
286
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
380
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
381
+
382
+ for (int j = 0; j < ncols_interleaved; j++) {
383
+ sumf[j] = 0.0f;
384
+ }
287
385
 
288
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
289
386
  for (int l = 0; l < nb; l++) {
290
387
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
388
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
389
+ const int base_h = base_l + 64;
390
+
391
+ const int scale_idx_l = base_l / 16;
392
+ const int scale_idx_h = base_h / 16;
393
+
394
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
395
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
396
+
397
+ const int qh_half_l = (base_l / 128) * 32;
398
+ const int qh_half_h = (base_h / 128) * 32;
399
+
291
400
  for (int j = 0; j < ncols_interleaved; j++) {
292
- sumi = 0;
293
- for (int i = 0; i < blocklen; ++i) {
294
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
295
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
296
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
401
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
402
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
403
+
404
+ int sumi_l = 0;
405
+ int sumi_h = 0;
406
+
407
+ for (int i = 0; i < blocklen; i++) {
408
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
409
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
410
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
411
+
412
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
413
+ const int qh_chunk_l = qh_idx_l / blocklen;
414
+ const int qh_pos_l = qh_idx_l % blocklen;
415
+ const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
416
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
417
+
418
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
419
+ const int qh_chunk_h = qh_idx_h / blocklen;
420
+ const int qh_pos_h = qh_idx_h % blocklen;
421
+ const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
422
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
423
+
424
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
425
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
426
+
427
+ const int8_t a_l = a_ptr[l].qs[base_l + i];
428
+ const int8_t a_h = a_ptr[l].qs[base_h + i];
429
+
430
+ sumi_l += q_l * a_l;
431
+ sumi_h += q_h * a_h;
297
432
  }
298
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
433
+
434
+ sumf[j] +=
435
+ (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
299
436
  }
300
437
  }
301
438
  }
302
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
439
+
440
+ for (int j = 0; j < ncols_interleaved; j++) {
441
+ s[x * ncols_interleaved + j] = sumf[j];
442
+ }
303
443
  }
304
444
  }
305
445
 
306
- void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
307
- const int qk = QK8_0;
308
- const int nb = n / qk;
309
- const int ncols_interleaved = 4;
310
- const int blocklen = 8;
446
+ template <int M, int N>
447
+ static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
448
+ float * GGML_RESTRICT s,
449
+ size_t bs,
450
+ const void * GGML_RESTRICT vx,
451
+ const void * GGML_RESTRICT vy,
452
+ int nr,
453
+ int nc) {
454
+ constexpr int blocklen = M;
455
+ constexpr int ncols_interleaved = N;
456
+ const int qk = QK_K;
457
+ const int nb = n / qk;
458
+ const int blocks_per_half = 64 / blocklen;
459
+ const int q8_half_stride = 512;
460
+ const int q8_low_high_step = 256;
311
461
 
312
- assert (n % qk == 0);
313
- assert (nc % ncols_interleaved == 0);
462
+ assert(n % qk == 0);
463
+ assert(nr % 4 == 0);
464
+ assert(nc % ncols_interleaved == 0);
314
465
 
315
- UNUSED(s);
316
466
  UNUSED(bs);
317
- UNUSED(vx);
318
- UNUSED(vy);
319
- UNUSED(nr);
320
- UNUSED(nc);
321
- UNUSED(nb);
322
- UNUSED(ncols_interleaved);
323
- UNUSED(blocklen);
324
467
 
325
- float sumf[4];
326
- int sumi;
468
+ float sumf[4][8];
327
469
 
328
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
329
- for (int x = 0; x < nc / ncols_interleaved; x++) {
330
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
470
+ for (int y = 0; y < nr / 4; y++) {
471
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
472
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
473
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
331
474
 
332
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
333
- for (int l = 0; l < nb; l++) {
334
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
475
+ for (int m = 0; m < 4; m++) {
335
476
  for (int j = 0; j < ncols_interleaved; j++) {
336
- sumi = 0;
337
- for (int i = 0; i < blocklen; ++i) {
338
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
339
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
340
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
341
- }
342
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
477
+ sumf[m][j] = 0.0f;
343
478
  }
344
479
  }
345
- }
346
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
347
- }
348
- }
349
480
 
350
- void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
351
- const int qk = QK8_0;
352
- const int nb = n / qk;
353
- const int ncols_interleaved = 8;
354
- const int blocklen = 8;
481
+ for (int l = 0; l < nb; l++) {
482
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
483
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
484
+ const int base_h = base_l + 64;
355
485
 
356
- assert (n % qk == 0);
357
- assert (nc % ncols_interleaved == 0);
486
+ const int scale_idx_l = base_l / 16;
487
+ const int scale_idx_h = base_h / 16;
358
488
 
359
- UNUSED(s);
360
- UNUSED(bs);
361
- UNUSED(vx);
362
- UNUSED(vy);
363
- UNUSED(nr);
364
- UNUSED(nc);
365
- UNUSED(nb);
366
- UNUSED(ncols_interleaved);
367
- UNUSED(blocklen);
489
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
490
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
368
491
 
369
- float sumf[8];
370
- int sumi;
492
+ const int qh_half_l = (base_l / 128) * 32;
493
+ const int qh_half_h = (base_h / 128) * 32;
371
494
 
372
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
373
- for (int x = 0; x < nc / ncols_interleaved; x++) {
374
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
495
+ const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
375
496
 
376
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
377
- for (int l = 0; l < nb; l++) {
378
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
379
- for (int j = 0; j < ncols_interleaved; j++) {
380
- sumi = 0;
381
- for (int i = 0; i < blocklen; ++i) {
382
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
383
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
384
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
497
+ for (int m = 0; m < 4; m++) {
498
+ for (int j = 0; j < ncols_interleaved; j++) {
499
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
500
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
501
+
502
+ int sumi_l = 0;
503
+ int sumi_h = 0;
504
+
505
+ for (int i = 0; i < blocklen; i++) {
506
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
507
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
508
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
509
+
510
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
511
+ const int qh_chunk_l = qh_idx_l / blocklen;
512
+ const int qh_pos_l = qh_idx_l % blocklen;
513
+ const int qh_offset_l =
514
+ qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
515
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
516
+
517
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
518
+ const int qh_chunk_h = qh_idx_h / blocklen;
519
+ const int qh_pos_h = qh_idx_h % blocklen;
520
+ const int qh_offset_h =
521
+ qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
522
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
523
+
524
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
525
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
526
+
527
+ const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
528
+ const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
529
+
530
+ sumi_l += q_l * q8_l;
531
+ sumi_h += q_h * q8_h;
532
+ }
533
+
534
+ sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
535
+ a_ptr[l].d[m];
536
+ }
385
537
  }
386
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
538
+ }
539
+ }
540
+
541
+ for (int m = 0; m < 4; m++) {
542
+ for (int j = 0; j < ncols_interleaved; j++) {
543
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
387
544
  }
388
545
  }
389
546
  }
390
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
391
547
  }
392
548
  }
393
549
 
394
- void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
395
- const int qk = QK_K;
396
- const int nb = n / qk;
397
- const int ncols_interleaved = 8;
398
- const int blocklen = 4;
399
- static const uint32_t kmask1 = 0x3f3f3f3f;
400
- static const uint32_t kmask2 = 0x0f0f0f0f;
401
- static const uint32_t kmask3 = 0x03030303;
550
+ template <int M, int N>
551
+ static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
552
+ float * GGML_RESTRICT s,
553
+ size_t bs,
554
+ const void * GGML_RESTRICT vx,
555
+ const void * GGML_RESTRICT vy,
556
+ int nr,
557
+ int nc) {
558
+ constexpr int blocklen = M;
559
+ constexpr int ncols_interleaved = N;
560
+ const int qk = QK_K;
561
+ const int nb = n / qk;
562
+ static const uint32_t kmask1 = 0x3f3f3f3f;
563
+ static const uint32_t kmask2 = 0x0f0f0f0f;
564
+ static const uint32_t kmask3 = 0x03030303;
402
565
 
403
- assert (n % qk == 0);
404
- assert (nc % ncols_interleaved == 0);
566
+ assert(n % qk == 0);
567
+ assert(nc % ncols_interleaved == 0);
405
568
 
406
569
  UNUSED(bs);
407
570
  UNUSED(nr);
408
571
 
409
- float sumf[8];
410
- float sum_minf[8];
572
+ float sumf[ncols_interleaved];
573
+ float sum_minf[ncols_interleaved];
411
574
  uint32_t utmp[32];
412
- int sumi1;
413
- int sumi2;
414
- int sumi;
575
+ int sumi1;
576
+ int sumi2;
577
+ int sumi;
415
578
 
416
579
  const block_q8_K * a_ptr = (const block_q8_K *) vy;
417
580
  for (int x = 0; x < nc / ncols_interleaved; x++) {
418
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
581
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
419
582
 
420
583
  for (int j = 0; j < ncols_interleaved; j++) {
421
- sumf[j] = 0.0;
584
+ sumf[j] = 0.0;
422
585
  sum_minf[j] = 0.0;
423
586
  }
424
587
  for (int l = 0; l < nb; l++) {
425
588
  for (int sb = 0; sb < 8; sb++) {
426
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
427
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
589
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
590
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
428
591
  const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
429
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
430
- utmp[sb * 4 + 2] = uaux_0;
592
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
593
+ utmp[sb * 4 + 2] = uaux_0;
431
594
  utmp[sb * 4 + 0] &= kmask1;
432
595
  }
433
596
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
434
- uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
435
- uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
597
+ constexpr int scale_stride = 32;
598
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
599
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
600
+
601
+ const int qh_shift = (k / (32 / blocklen)) * 2;
436
602
  for (int j = 0; j < ncols_interleaved; j++) {
437
603
  sumi1 = 0;
438
604
  sumi2 = 0;
439
- sumi = 0;
605
+ sumi = 0;
440
606
  for (int i = 0; i < blocklen; ++i) {
441
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
442
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
443
- sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
444
- sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
607
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
608
+
609
+ const int qh_idx = (k * blocklen + i) % 32;
610
+ const int qh_chunk = qh_idx / blocklen;
611
+ const int qh_pos = qh_idx % blocklen;
612
+ const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
613
+
614
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
615
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
616
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
617
+
618
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
619
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
620
+
621
+ const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
622
+
623
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
624
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
445
625
  sumi1 = sumi1 * scales_0[j];
446
626
  sumi2 = sumi2 * scales_1[j];
447
627
  sumi += sumi1 + sumi2;
@@ -452,7 +632,8 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
452
632
  for (int sb = 0; sb < 8; sb++) {
453
633
  uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
454
634
  for (int j = 0; j < ncols_interleaved; j++) {
455
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
635
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
636
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
456
637
  }
457
638
  }
458
639
  }
@@ -462,17 +643,123 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
462
643
  }
463
644
  }
464
645
 
465
- void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
466
- const int qk = QK_K;
646
+ template <int M, int N>
647
+ static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
648
+ float * GGML_RESTRICT s,
649
+ size_t bs,
650
+ const void * GGML_RESTRICT vx,
651
+ const void * GGML_RESTRICT vy,
652
+ int nr,
653
+ int nc) {
654
+ constexpr int blocklen = M;
655
+ constexpr int ncols_interleaved = N;
656
+ const int qk = QK_K;
657
+ const int nb = n / qk;
658
+ static const uint32_t kmask1 = 0x3f3f3f3f;
659
+ static const uint32_t kmask2 = 0x0f0f0f0f;
660
+ static const uint32_t kmask3 = 0x03030303;
661
+
662
+ assert(n % qk == 0);
663
+ assert(nr % 4 == 0);
664
+ assert(nc % ncols_interleaved == 0);
665
+
666
+ float sumf[4][ncols_interleaved];
667
+ float sum_minf[4][ncols_interleaved];
668
+ uint32_t utmp[32];
669
+ int sumi1;
670
+ int sumi2;
671
+ int sumi;
672
+
673
+ for (int y = 0; y < nr / 4; y++) {
674
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
675
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
676
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
677
+ for (int m = 0; m < 4; m++) {
678
+ for (int j = 0; j < ncols_interleaved; j++) {
679
+ sumf[m][j] = 0.0;
680
+ sum_minf[m][j] = 0.0;
681
+ }
682
+ }
683
+ for (int l = 0; l < nb; l++) {
684
+ for (int sb = 0; sb < 8; sb++) {
685
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
686
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
687
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
688
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
689
+ utmp[sb * 4 + 2] = uaux_0;
690
+ utmp[sb * 4 + 0] &= kmask1;
691
+ }
692
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
693
+ constexpr int scale_stride = 32;
694
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
695
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
696
+
697
+ const int qh_shift = (k / (32 / blocklen)) * 2;
698
+ for (int m = 0; m < 4; m++) {
699
+ for (int j = 0; j < ncols_interleaved; j++) {
700
+ sumi1 = 0;
701
+ sumi2 = 0;
702
+ sumi = 0;
703
+ for (int i = 0; i < blocklen; ++i) {
704
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
705
+
706
+ const int qh_idx = (k * blocklen + i) % 32;
707
+ const int qh_chunk = qh_idx / blocklen;
708
+ const int qh_pos = qh_idx % blocklen;
709
+ const int b_qh_offset =
710
+ qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
711
+
712
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
713
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
714
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
715
+
716
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
717
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
718
+
719
+ const int q8_offset = (k / (32 / blocklen)) * 256 +
720
+ (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
721
+
722
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
723
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
724
+ sumi1 = sumi1 * scales_0[j];
725
+ sumi2 = sumi2 * scales_1[j];
726
+ sumi += sumi1 + sumi2;
727
+ }
728
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
729
+ }
730
+ }
731
+ }
732
+ for (int sb = 0; sb < 8; sb++) {
733
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
734
+ for (int m = 0; m < 4; m++) {
735
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
736
+ for (int j = 0; j < ncols_interleaved; j++) {
737
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
738
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
739
+ }
740
+ }
741
+ }
742
+ }
743
+ for (int m = 0; m < 4; m++) {
744
+ for (int j = 0; j < ncols_interleaved; j++) {
745
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
746
+ }
747
+ }
748
+ }
749
+ }
750
+ }
751
+
752
+ extern "C" {
753
+
754
+ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
755
+ const int qk = QK8_0;
467
756
  const int nb = n / qk;
468
- const int ncols_interleaved = 8;
469
- const int blocklen = 8;
470
- static const uint32_t kmask1 = 0x3f3f3f3f;
471
- static const uint32_t kmask2 = 0x0f0f0f0f;
472
- static const uint32_t kmask3 = 0x03030303;
757
+ const int ncols_interleaved = 4;
758
+ const int blocklen = 4;
473
759
 
474
- assert (n % qk == 0);
475
- assert (nc % ncols_interleaved == 0);
760
+ assert(nr == 1);
761
+ assert(n % qk == 0);
762
+ assert(nc % ncols_interleaved == 0);
476
763
 
477
764
  UNUSED(s);
478
765
  UNUSED(bs);
@@ -484,66 +771,35 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
484
771
  UNUSED(ncols_interleaved);
485
772
  UNUSED(blocklen);
486
773
 
487
- float sumf[8];
488
- float sum_minf[8];
489
- uint32_t utmp[32];
490
- int sumi1;
491
- int sumi2;
774
+ float sumf[4];
492
775
  int sumi;
493
776
 
494
- const block_q8_K * a_ptr = (const block_q8_K *) vy;
777
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
495
778
  for (int x = 0; x < nc / ncols_interleaved; x++) {
496
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
779
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
497
780
 
498
- for (int j = 0; j < ncols_interleaved; j++) {
499
- sumf[j] = 0.0;
500
- sum_minf[j] = 0.0;
501
- }
781
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
502
782
  for (int l = 0; l < nb; l++) {
503
- for (int sb = 0; sb < 8; sb++) {
504
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
505
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
506
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
507
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
508
- utmp[sb * 4 + 2] = uaux_0;
509
- utmp[sb * 4 + 0] &= kmask1;
510
- }
511
783
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
512
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
513
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
514
784
  for (int j = 0; j < ncols_interleaved; j++) {
515
- sumi1 = 0;
516
- sumi2 = 0;
517
785
  sumi = 0;
518
786
  for (int i = 0; i < blocklen; ++i) {
519
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
520
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
521
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
522
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
523
- sumi1 = sumi1 * scales_0[j];
524
- sumi2 = sumi2 * scales_1[j];
525
- sumi += sumi1 + sumi2;
787
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
788
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
789
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
526
790
  }
527
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
528
- }
529
- }
530
- for (int sb = 0; sb < 8; sb++) {
531
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
532
- for (int j = 0; j < ncols_interleaved; j++) {
533
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
791
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
534
792
  }
535
793
  }
536
794
  }
537
- for (int j = 0; j < ncols_interleaved; j++) {
538
- s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
539
- }
795
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
540
796
  }
541
797
  }
542
798
 
543
- void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
544
- const int qk = QK_K;
799
+ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
800
+ const int qk = QK8_0;
545
801
  const int nb = n / qk;
546
- const int ncols_interleaved = 8;
802
+ const int ncols_interleaved = 4;
547
803
  const int blocklen = 8;
548
804
 
549
805
  assert (n % qk == 0);
@@ -559,82 +815,56 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
559
815
  UNUSED(ncols_interleaved);
560
816
  UNUSED(blocklen);
561
817
 
562
- float sumf[8];
563
- float sum_minf[8];
564
- int sumi1,sumi2,sumi3,sumi4;
818
+ float sumf[4];
565
819
  int sumi;
566
820
 
567
- const block_q8_K * a_ptr = (const block_q8_K *)vy;
568
- for(int x = 0; x < nc / ncols_interleaved; x++) {
569
- const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
570
- for (int j = 0; j < ncols_interleaved; j++) {
571
- sumf[j] = 0.0;
572
- sum_minf[j] = 0.0;
573
- }
821
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
822
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
823
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
824
+
825
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
574
826
  for (int l = 0; l < nb; l++) {
575
- for (int k = 0; k < (qk / (4 * blocklen)); k++) {
576
- const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
577
- const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
578
- const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
579
- const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
827
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
580
828
  for (int j = 0; j < ncols_interleaved; j++) {
581
- sumi1 = 0;
582
- sumi2 = 0;
583
- sumi3 = 0;
584
- sumi4 = 0;
585
829
  sumi = 0;
586
- int offset = ((k / 2) % 2) + j * 2;
587
- for (int i = 0; i < blocklen; ++i){
588
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
589
- const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
590
- const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
591
- const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
592
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
593
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
594
- sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
595
- sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
596
-
597
- sumi1 = sumi1 * (scales_0[offset] & 0xF);
598
- sumi2 = sumi2 * (scales_1[offset] & 0xF);
599
- sumi3 = sumi3 * (scales_2[offset] & 0xF);
600
- sumi4 = sumi4 * (scales_3[offset] & 0xF);
601
- sumi += sumi1 + sumi2 + sumi3 + sumi4;
830
+ for (int i = 0; i < blocklen; ++i) {
831
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
832
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
833
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
602
834
  }
603
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
604
- }
605
- }
606
- for(int sb = 0; sb < 8; sb++) {
607
- const uint8_t *mins = b_ptr[l].scales + sb * 16;
608
- for(int j = 0; j < ncols_interleaved; j++){
609
- sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
835
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
610
836
  }
611
837
  }
612
838
  }
613
- for (int j = 0; j < ncols_interleaved; j++) {
614
- s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
615
- }
839
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
616
840
  }
617
841
  }
618
842
 
619
- void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
843
+ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
620
844
  const int qk = QK8_0;
621
845
  const int nb = n / qk;
622
- const int ncols_interleaved = 4;
623
- const int blocklen = 4;
846
+ const int ncols_interleaved = 8;
847
+ const int blocklen = 8;
624
848
 
625
- assert(nr == 1);
626
- assert(n % qk == 0);
627
- assert(nc % ncols_interleaved == 0);
849
+ assert (n % qk == 0);
850
+ assert (nc % ncols_interleaved == 0);
628
851
 
852
+ UNUSED(s);
629
853
  UNUSED(bs);
854
+ UNUSED(vx);
855
+ UNUSED(vy);
630
856
  UNUSED(nr);
857
+ UNUSED(nc);
858
+ UNUSED(nb);
859
+ UNUSED(ncols_interleaved);
860
+ UNUSED(blocklen);
631
861
 
632
- float sumf[4];
862
+ float sumf[8];
633
863
  int sumi;
634
864
 
635
865
  const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
636
866
  for (int x = 0; x < nc / ncols_interleaved; x++) {
637
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
867
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
638
868
 
639
869
  for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
640
870
  for (int l = 0; l < nb; l++) {
@@ -642,9 +872,9 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
642
872
  for (int j = 0; j < ncols_interleaved; j++) {
643
873
  sumi = 0;
644
874
  for (int i = 0; i < blocklen; ++i) {
645
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
646
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
647
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
875
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
876
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
877
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
648
878
  }
649
879
  sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
650
880
  }
@@ -654,139 +884,1210 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
654
884
  }
655
885
  }
656
886
 
657
- void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
658
- const int qk = QK8_0;
887
+ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
888
+ const int qk = QK_K;
659
889
  const int nb = n / qk;
660
890
  const int ncols_interleaved = 8;
661
- const int blocklen = 8;
891
+ const int blocklen = 4;
892
+ static const uint32_t kmask1 = 0x3f3f3f3f;
893
+ static const uint32_t kmask2 = 0x0f0f0f0f;
894
+ static const uint32_t kmask3 = 0x03030303;
662
895
 
663
- assert(nr == 1);
664
- assert(n % qk == 0);
665
- assert(nc % ncols_interleaved == 0);
896
+ assert (n % qk == 0);
897
+ assert (nc % ncols_interleaved == 0);
666
898
 
667
899
  UNUSED(bs);
668
900
  UNUSED(nr);
669
901
 
670
902
  float sumf[8];
903
+ float sum_minf[8];
904
+ uint32_t utmp[32];
905
+ int sumi1;
906
+ int sumi2;
671
907
  int sumi;
672
908
 
673
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
909
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
674
910
  for (int x = 0; x < nc / ncols_interleaved; x++) {
675
- const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
911
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
676
912
 
677
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
913
+ for (int j = 0; j < ncols_interleaved; j++) {
914
+ sumf[j] = 0.0;
915
+ sum_minf[j] = 0.0;
916
+ }
678
917
  for (int l = 0; l < nb; l++) {
918
+ for (int sb = 0; sb < 8; sb++) {
919
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
920
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
921
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
922
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
923
+ utmp[sb * 4 + 2] = uaux_0;
924
+ utmp[sb * 4 + 0] &= kmask1;
925
+ }
679
926
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
927
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
928
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
680
929
  for (int j = 0; j < ncols_interleaved; j++) {
681
- sumi = 0;
930
+ sumi1 = 0;
931
+ sumi2 = 0;
932
+ sumi = 0;
933
+ for (int i = 0; i < blocklen; ++i) {
934
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
935
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
936
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
937
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
938
+ sumi1 = sumi1 * scales_0[j];
939
+ sumi2 = sumi2 * scales_1[j];
940
+ sumi += sumi1 + sumi2;
941
+ }
942
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
943
+ }
944
+ }
945
+ for (int sb = 0; sb < 8; sb++) {
946
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
947
+ for (int j = 0; j < ncols_interleaved; j++) {
948
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
949
+ }
950
+ }
951
+ }
952
+ for (int j = 0; j < ncols_interleaved; j++) {
953
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
954
+ }
955
+ }
956
+ }
957
+
958
+ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
959
+ const int qk = QK_K;
960
+ const int nb = n / qk;
961
+ const int ncols_interleaved = 8;
962
+ const int blocklen = 8;
963
+ static const uint32_t kmask1 = 0x3f3f3f3f;
964
+ static const uint32_t kmask2 = 0x0f0f0f0f;
965
+ static const uint32_t kmask3 = 0x03030303;
966
+
967
+ assert (n % qk == 0);
968
+ assert (nc % ncols_interleaved == 0);
969
+
970
+ UNUSED(bs);
971
+ UNUSED(nr);
972
+
973
+ float sumf[8];
974
+ float sum_minf[8];
975
+ uint32_t utmp[32];
976
+ int sumi1;
977
+ int sumi2;
978
+ int sumi;
979
+
980
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
981
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
982
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
983
+
984
+ for (int j = 0; j < ncols_interleaved; j++) {
985
+ sumf[j] = 0.0;
986
+ sum_minf[j] = 0.0;
987
+ }
988
+ for (int l = 0; l < nb; l++) {
989
+ for (int sb = 0; sb < 8; sb++) {
990
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
991
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
992
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
993
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
994
+ utmp[sb * 4 + 2] = uaux_0;
995
+ utmp[sb * 4 + 0] &= kmask1;
996
+ }
997
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
998
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
999
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
1000
+ for (int j = 0; j < ncols_interleaved; j++) {
1001
+ sumi1 = 0;
1002
+ sumi2 = 0;
1003
+ sumi = 0;
1004
+ for (int i = 0; i < blocklen; ++i) {
1005
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1006
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1007
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
1008
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
1009
+ sumi1 = sumi1 * scales_0[j];
1010
+ sumi2 = sumi2 * scales_1[j];
1011
+ sumi += sumi1 + sumi2;
1012
+ }
1013
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1014
+ }
1015
+ }
1016
+ for (int sb = 0; sb < 8; sb++) {
1017
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1018
+ for (int j = 0; j < ncols_interleaved; j++) {
1019
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1020
+ }
1021
+ }
1022
+ }
1023
+ for (int j = 0; j < ncols_interleaved; j++) {
1024
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1030
+ const int qk = QK_K;
1031
+ const int nb = n / qk;
1032
+ const int ncols_interleaved = 8;
1033
+ const int blocklen = 8;
1034
+
1035
+ assert (n % qk == 0);
1036
+ assert (nc % ncols_interleaved == 0);
1037
+
1038
+ UNUSED(s);
1039
+ UNUSED(bs);
1040
+ UNUSED(vx);
1041
+ UNUSED(vy);
1042
+ UNUSED(nr);
1043
+ UNUSED(nc);
1044
+ UNUSED(nb);
1045
+ UNUSED(ncols_interleaved);
1046
+ UNUSED(blocklen);
1047
+
1048
+ float sumf[8];
1049
+ float sum_minf[8];
1050
+ int sumi1,sumi2,sumi3,sumi4;
1051
+ int sumi;
1052
+
1053
+ const block_q8_K * a_ptr = (const block_q8_K *)vy;
1054
+ for(int x = 0; x < nc / ncols_interleaved; x++) {
1055
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
1056
+ for (int j = 0; j < ncols_interleaved; j++) {
1057
+ sumf[j] = 0.0;
1058
+ sum_minf[j] = 0.0;
1059
+ }
1060
+ for (int l = 0; l < nb; l++) {
1061
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
1062
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
1063
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
1064
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
1065
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
1066
+ for (int j = 0; j < ncols_interleaved; j++) {
1067
+ sumi1 = 0;
1068
+ sumi2 = 0;
1069
+ sumi3 = 0;
1070
+ sumi4 = 0;
1071
+ sumi = 0;
1072
+ int offset = ((k / 2) % 2) + j * 2;
1073
+ for (int i = 0; i < blocklen; ++i){
1074
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
1075
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
1076
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
1077
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
1078
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
1079
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
1080
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
1081
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
1082
+
1083
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
1084
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
1085
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
1086
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
1087
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
1088
+ }
1089
+ sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1090
+ }
1091
+ }
1092
+ for(int sb = 0; sb < 8; sb++) {
1093
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
1094
+ for(int j = 0; j < ncols_interleaved; j++){
1095
+ sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1096
+ }
1097
+ }
1098
+ }
1099
+ for (int j = 0; j < ncols_interleaved; j++) {
1100
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1101
+ }
1102
+ }
1103
+ }
1104
+
1105
+ void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1106
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1107
+ }
1108
+
1109
+ void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1110
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1111
+ }
1112
+
1113
+
1114
+ void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1115
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1116
+ }
1117
+
1118
+ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1119
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1120
+ }
1121
+
1122
+ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1123
+ const int qk = QK8_0;
1124
+ const int nb = n / qk;
1125
+ const int ncols_interleaved = 4;
1126
+ const int blocklen = 4;
1127
+
1128
+ assert(nr == 1);
1129
+ assert(n % qk == 0);
1130
+ assert(nc % ncols_interleaved == 0);
1131
+
1132
+ UNUSED(bs);
1133
+ UNUSED(nr);
1134
+
1135
+ float sumf[4];
1136
+ int sumi;
1137
+
1138
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1139
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1140
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1141
+
1142
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1143
+ for (int l = 0; l < nb; l++) {
1144
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1145
+ for (int j = 0; j < ncols_interleaved; j++) {
1146
+ sumi = 0;
1147
+ for (int i = 0; i < blocklen; ++i) {
1148
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1149
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1150
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1151
+ }
1152
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1153
+ }
1154
+ }
1155
+ }
1156
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1157
+ }
1158
+ }
1159
+
1160
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1161
+ const int qk = QK8_0;
1162
+ const int nb = n / qk;
1163
+ const int ncols_interleaved = 8;
1164
+ const int blocklen = 8;
1165
+
1166
+ assert(nr == 1);
1167
+ assert(n % qk == 0);
1168
+ assert(nc % ncols_interleaved == 0);
1169
+
1170
+ UNUSED(bs);
1171
+ UNUSED(nr);
1172
+
1173
+ float sumf[8];
1174
+ int sumi;
1175
+
1176
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1177
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1178
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
1179
+
1180
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1181
+ for (int l = 0; l < nb; l++) {
1182
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1183
+ for (int j = 0; j < ncols_interleaved; j++) {
1184
+ sumi = 0;
682
1185
  for (int i = 0; i < blocklen; ++i) {
683
1186
  const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
684
1187
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
685
1188
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
686
1189
  }
687
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1190
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1191
+ }
1192
+ }
1193
+ }
1194
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1195
+ }
1196
+ }
1197
+
1198
+ void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1199
+ const int qk = QK8_0;
1200
+ const int nb = n / qk;
1201
+ const int ncols_interleaved = 4;
1202
+ const int blocklen = 4;
1203
+
1204
+ assert(nr == 1);
1205
+ assert(n % qk == 0);
1206
+ assert(nc % ncols_interleaved == 0);
1207
+
1208
+ UNUSED(bs);
1209
+ UNUSED(nr);
1210
+
1211
+ float sumf[4];
1212
+ int sumi;
1213
+
1214
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1215
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1216
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
1217
+
1218
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1219
+ for (int l = 0; l < nb; l++) {
1220
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1221
+ for (int j = 0; j < ncols_interleaved; j++) {
1222
+ sumi = 0;
1223
+ for (int i = 0; i < blocklen; ++i) {
1224
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1225
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1226
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1227
+ }
1228
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1229
+ }
1230
+ }
1231
+ }
1232
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1233
+ }
1234
+ }
1235
+
1236
+ void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1237
+ const int qk = QK8_0;
1238
+ const int nb = n / qk;
1239
+ const int ncols_interleaved = 8;
1240
+ const int blocklen = 8;
1241
+
1242
+ assert(nr == 1);
1243
+ assert(n % qk == 0);
1244
+ assert(nc % ncols_interleaved == 0);
1245
+
1246
+ UNUSED(bs);
1247
+ UNUSED(nr);
1248
+
1249
+ float sumf[8];
1250
+ int sumi;
1251
+
1252
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1253
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1254
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
1255
+
1256
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1257
+ for (int l = 0; l < nb; l++) {
1258
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1259
+ for (int j = 0; j < ncols_interleaved; j++) {
1260
+ sumi = 0;
1261
+ for (int i = 0; i < blocklen; ++i) {
1262
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1263
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1264
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1265
+ }
1266
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1267
+ }
1268
+ }
1269
+ }
1270
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1271
+ }
1272
+ }
1273
+
1274
+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
1275
+ float * GGML_RESTRICT s,
1276
+ size_t bs,
1277
+ const void * GGML_RESTRICT vx,
1278
+ const void * GGML_RESTRICT vy,
1279
+ int nr,
1280
+ int nc) {
1281
+ const int qk = QK8_0;
1282
+ const int nb = n / qk;
1283
+ const int ncols_interleaved = 4;
1284
+ const int blocklen = 4;
1285
+
1286
+ assert(nr == 1);
1287
+ assert(n % qk == 0);
1288
+ assert(nc % ncols_interleaved == 0);
1289
+
1290
+ UNUSED(bs);
1291
+ UNUSED(nr);
1292
+
1293
+ float sumf[4];
1294
+ int sumi;
1295
+
1296
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1297
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1298
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1299
+
1300
+ for (int j = 0; j < ncols_interleaved; j++) {
1301
+ sumf[j] = 0.0;
1302
+ }
1303
+ for (int l = 0; l < nb; l++) {
1304
+ for (int k = 0; k < (qk / blocklen); k++) {
1305
+ for (int j = 0; j < ncols_interleaved; j++) {
1306
+ sumi = 0;
1307
+ for (int i = 0; i < blocklen; ++i) {
1308
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1309
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1310
+ }
1311
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1312
+ }
1313
+ }
1314
+ }
1315
+ for (int j = 0; j < ncols_interleaved; j++) {
1316
+ s[x * ncols_interleaved + j] = sumf[j];
1317
+ }
1318
+ }
1319
+ }
1320
+
1321
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
1322
+ float * GGML_RESTRICT s,
1323
+ size_t bs,
1324
+ const void * GGML_RESTRICT vx,
1325
+ const void * GGML_RESTRICT vy,
1326
+ int nr,
1327
+ int nc) {
1328
+ const int qk = QK8_0;
1329
+ const int nb = n / qk;
1330
+ const int ncols_interleaved = 4;
1331
+ const int blocklen = 8;
1332
+
1333
+ assert(nr == 1);
1334
+ assert(n % qk == 0);
1335
+ assert(nc % ncols_interleaved == 0);
1336
+
1337
+ UNUSED(bs);
1338
+ UNUSED(nr);
1339
+
1340
+ float sumf[4];
1341
+ int sumi;
1342
+
1343
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1344
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1345
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1346
+
1347
+ for (int j = 0; j < ncols_interleaved; j++) {
1348
+ sumf[j] = 0.0;
1349
+ }
1350
+ for (int l = 0; l < nb; l++) {
1351
+ for (int k = 0; k < (qk / blocklen); k++) {
1352
+ for (int j = 0; j < ncols_interleaved; j++) {
1353
+ sumi = 0;
1354
+ for (int i = 0; i < blocklen; ++i) {
1355
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1356
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1357
+ }
1358
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1359
+ }
1360
+ }
1361
+ }
1362
+ for (int j = 0; j < ncols_interleaved; j++) {
1363
+ s[x * ncols_interleaved + j] = sumf[j];
1364
+ }
1365
+ }
1366
+ }
1367
+
1368
+ #if defined __riscv_zvfh
1369
+ void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1370
+ const int qk = QK8_0;
1371
+ const int nb = n / qk;
1372
+ const int ncols_interleaved = 16;
1373
+ const int blocklen = 1;
1374
+
1375
+ assert (n % qk == 0);
1376
+ assert (nc % ncols_interleaved == 0);
1377
+
1378
+ UNUSED(s);
1379
+ UNUSED(bs);
1380
+ UNUSED(vx);
1381
+ UNUSED(vy);
1382
+ UNUSED(nr);
1383
+ UNUSED(nc);
1384
+ UNUSED(nb);
1385
+ UNUSED(ncols_interleaved);
1386
+ UNUSED(blocklen);
1387
+
1388
+ float sumf[16];
1389
+ int sumi;
1390
+
1391
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1392
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1393
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
1394
+
1395
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1396
+ for (int l = 0; l < nb; l++) {
1397
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1398
+ for (int j = 0; j < ncols_interleaved; j++) {
1399
+ sumi = 0;
1400
+ for (int i = 0; i < blocklen; ++i) {
1401
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1402
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1403
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
1404
+ }
1405
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1406
+ }
1407
+ }
1408
+ }
1409
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1410
+ }
1411
+ }
1412
+
1413
+ void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1414
+ const int qk = QK_K;
1415
+ const int nb = n / qk;
1416
+ const int ncols_interleaved = 16;
1417
+ const int blocklen = 1;
1418
+ assert (n % qk == 0);
1419
+ assert (nc % ncols_interleaved == 0);
1420
+ UNUSED(s);
1421
+ UNUSED(bs);
1422
+ UNUSED(vx);
1423
+ UNUSED(vy);
1424
+ UNUSED(nr);
1425
+ UNUSED(nc);
1426
+ UNUSED(nb);
1427
+ UNUSED(ncols_interleaved);
1428
+ UNUSED(blocklen);
1429
+ float sumf[16];
1430
+ float sum_minf[16];
1431
+ uint8_t scales[128];
1432
+ uint8_t mins[128];
1433
+ int sumi1;
1434
+ int sumi2;
1435
+ int sumi;
1436
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
1437
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1438
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
1439
+ for (int j = 0; j < ncols_interleaved; j++) {
1440
+ sumf[j] = 0.0f;
1441
+ sum_minf[j] = 0.0f;
1442
+ }
1443
+ for (int l = 0; l < nb; l++) {
1444
+ for (int i = 0; i < 128; i++) {
1445
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
1446
+ mins[i] = b_ptr[l].scales[i] >> 4;
1447
+ }
1448
+ for (int i = 0; i < 64; i++) {
1449
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
1450
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
1451
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
1452
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
1453
+ }
1454
+ for (int sb = 0; sb < 8; sb++) {
1455
+ uint8_t *min = &mins[sb * 16];
1456
+ for (int j = 0; j < ncols_interleaved; j++) {
1457
+ sum_minf[j] += min[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1458
+ }
1459
+ }
1460
+ for (int sb = 0; sb < 8; sb += 2) {
1461
+ uint8_t *scales_0 = &scales[sb * 16];
1462
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
1463
+ for (int i = 0; i < QK4_0; i++) {
1464
+ for (int j = 0; j < ncols_interleaved; j++) {
1465
+ sumi1 = 0;
1466
+ sumi2 = 0;
1467
+ sumi = 0;
1468
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
1469
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
1470
+ sumi1 = (v0 * a_ptr[l].qs[sb * 32 + i]);
1471
+ sumi2 = (v1 * a_ptr[l].qs[sb * 32 + 32 + i]);
1472
+ sumi1 = sumi1 * scales_0[j];
1473
+ sumi2 = sumi2 * scales_1[j];
1474
+ sumi += sumi1 + sumi2;
1475
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1476
+ }
1477
+ }
1478
+ }
1479
+ }
1480
+ for (int j = 0; j < ncols_interleaved; j++) {
1481
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1482
+ }
1483
+ }
1484
+ }
1485
+
1486
+ void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1487
+ const int qk = QK8_0;
1488
+ const int nb = n / qk;
1489
+ const int ncols_interleaved = 16;
1490
+ const int blocklen = 1;
1491
+
1492
+ assert(nr == 1);
1493
+ assert(n % qk == 0);
1494
+ assert(nc % ncols_interleaved == 0);
1495
+
1496
+ UNUSED(bs);
1497
+ UNUSED(nr);
1498
+
1499
+ float sumf[16];
1500
+ int sumi;
1501
+
1502
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1503
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1504
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
1505
+
1506
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1507
+ for (int l = 0; l < nb; l++) {
1508
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1509
+ for (int j = 0; j < ncols_interleaved; j++) {
1510
+ sumi = 0;
1511
+ for (int i = 0; i < blocklen; ++i) {
1512
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1513
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1514
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1515
+ }
1516
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1517
+ }
1518
+ }
1519
+ }
1520
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1521
+ }
1522
+ }
1523
+
1524
+ void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1525
+ const int qk = QK8_0;
1526
+ const int nb = n / qk;
1527
+ const int ncols_interleaved = 16;
1528
+ const int blocklen = 1;
1529
+
1530
+ assert(nr == 1);
1531
+ assert(n % qk == 0);
1532
+ assert(nc % ncols_interleaved == 0);
1533
+
1534
+ UNUSED(bs);
1535
+ UNUSED(nr);
1536
+
1537
+ float sumf[16];
1538
+ int sumi;
1539
+
1540
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1541
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1542
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
1543
+
1544
+ for (int j = 0; j < ncols_interleaved; j++) {
1545
+ sumf[j] = 0.0;
1546
+ }
1547
+ for (int l = 0; l < nb; l++) {
1548
+ for (int k = 0; k < (qk / blocklen); k++) {
1549
+ for (int j = 0; j < ncols_interleaved; j++) {
1550
+ sumi = 0;
1551
+ for (int i = 0; i < blocklen; ++i) {
1552
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1553
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1554
+ }
1555
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1556
+ }
1557
+ }
1558
+ }
1559
+ for (int j = 0; j < ncols_interleaved; j++) {
1560
+ s[x * ncols_interleaved + j] = sumf[j];
1561
+ }
1562
+ }
1563
+ }
1564
+
1565
+ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1566
+ assert(n % QK_K == 0);
1567
+ assert(nr == 1);
1568
+ assert(nc % 16 == 0);
1569
+
1570
+ UNUSED(bs);
1571
+
1572
+ const int nb = n / QK_K;
1573
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
1574
+ const block_q8_K * y = (const block_q8_K *)vy;
1575
+
1576
+ // Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8...), Odd-High(9...)
1577
+ const int sb_perm[16] = {
1578
+ 0, 4, 1, 5, 2, 6, 3, 7, // 0-7
1579
+ 8, 12, 9, 13, 10, 14, 11, 15 // 8-15
1580
+ };
1581
+
1582
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
1583
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
1584
+ const block_q8_K * y_ptr = y;
1585
+
1586
+ float sumf[16] = {0};
1587
+
1588
+ // Loop over K-blocks
1589
+ for (int k_block = 0; k_block < nb; ++k_block) {
1590
+ int32_t isum[16] = {0};
1591
+ int32_t summs[16] = {0};
1592
+
1593
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
1594
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
1595
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
1596
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
1597
+
1598
+ // Iterate over sub-blocks 0..15
1599
+ for (int sb = 0; sb < 16; ++sb) {
1600
+ // Correction Term
1601
+ int16_t bsum = bs_lhs[sb];
1602
+ int scale_offset = sb_perm[sb] * 16;
1603
+
1604
+ for (int col = 0; col < 16; ++col) {
1605
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1606
+ summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
1607
+ }
1608
+
1609
+ // Main Dot Product
1610
+ // Calculate base offsets for Q2 unpacking based on SB
1611
+ int byte_base;
1612
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
1613
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
1614
+
1615
+ int shift = ((sb / 2) % 4) * 2;
1616
+
1617
+ for (int col = 0; col < 16; ++col) {
1618
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1619
+ int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits
1620
+
1621
+ // Process 16 elements (l=0..15)
1622
+ for (int l = 0; l < 16; ++l) {
1623
+ // Q2: Interleaved by column. Byte `l` contains 4 k-values.
1624
+ int qs_idx = (byte_base + l) * 16 + col;
1625
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
1626
+
1627
+ // Q8: Linear access
1628
+ int k = sb * 16 + l;
1629
+ int8_t q8_val = qs_lhs[k];
1630
+
1631
+ isum[col] += q8_val * q2_val * d_sb;
1632
+ }
1633
+ }
1634
+ }
1635
+
1636
+ // Finalize K-Block
1637
+ for (int col = 0; col < 16; ++col) {
1638
+ float d_lhs = y_ptr[k_block].d;
1639
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
1640
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
1641
+
1642
+ float d_all = d_lhs * d_rhs;
1643
+ float d_min = d_lhs * dm_rhs;
1644
+
1645
+ sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
1646
+ }
1647
+ }
1648
+
1649
+ for (int col = 0; col < 16; ++col) {
1650
+ s[col_tile + col] = sumf[col];
1651
+ }
1652
+ }
1653
+ }
1654
+ #endif
1655
+
1656
+ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1657
+ const int qk = QK8_0;
1658
+ const int nb = n / qk;
1659
+ const int ncols_interleaved = 4;
1660
+ const int blocklen = 4;
1661
+
1662
+ assert (n % qk == 0);
1663
+ assert (nr % 4 == 0);
1664
+ assert (nc % ncols_interleaved == 0);
1665
+
1666
+ UNUSED(s);
1667
+ UNUSED(bs);
1668
+ UNUSED(vx);
1669
+ UNUSED(vy);
1670
+ UNUSED(nr);
1671
+ UNUSED(nc);
1672
+ UNUSED(nb);
1673
+ UNUSED(ncols_interleaved);
1674
+ UNUSED(blocklen);
1675
+
1676
+ {
1677
+ float sumf[4][4];
1678
+ int sumi;
1679
+
1680
+ for (int y = 0; y < nr / 4; y++) {
1681
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1682
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1683
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1684
+ for (int m = 0; m < 4; m++) {
1685
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1686
+ }
1687
+ for (int l = 0; l < nb; l++) {
1688
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1689
+ for (int m = 0; m < 4; m++) {
1690
+ for (int j = 0; j < ncols_interleaved; j++) {
1691
+ sumi = 0;
1692
+ for (int i = 0; i < blocklen; ++i) {
1693
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1694
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1695
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1696
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1697
+ }
1698
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1699
+ }
1700
+ }
1701
+ }
1702
+ }
1703
+ for (int m = 0; m < 4; m++) {
1704
+ for (int j = 0; j < ncols_interleaved; j++)
1705
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1706
+ }
1707
+ }
1708
+ }
1709
+ }
1710
+ }
1711
+
1712
+ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1713
+ const int qk = QK8_0;
1714
+ const int nb = n / qk;
1715
+ const int ncols_interleaved = 4;
1716
+ const int blocklen = 8;
1717
+
1718
+ assert (n % qk == 0);
1719
+ assert (nr % 4 == 0);
1720
+ assert (nc % ncols_interleaved == 0);
1721
+
1722
+ UNUSED(s);
1723
+ UNUSED(bs);
1724
+ UNUSED(vx);
1725
+ UNUSED(vy);
1726
+ UNUSED(nr);
1727
+ UNUSED(nc);
1728
+ UNUSED(nb);
1729
+ UNUSED(ncols_interleaved);
1730
+ UNUSED(blocklen);
1731
+
1732
+ float sumf[4][4];
1733
+ int sumi;
1734
+
1735
+ for (int y = 0; y < nr / 4; y++) {
1736
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1737
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1738
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1739
+ for (int m = 0; m < 4; m++) {
1740
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1741
+ }
1742
+ for (int l = 0; l < nb; l++) {
1743
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1744
+ for (int m = 0; m < 4; m++) {
1745
+ for (int j = 0; j < ncols_interleaved; j++) {
1746
+ sumi = 0;
1747
+ for (int i = 0; i < blocklen; ++i) {
1748
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1749
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1750
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1751
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1752
+ }
1753
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1754
+ }
1755
+ }
1756
+ }
1757
+ }
1758
+ for (int m = 0; m < 4; m++) {
1759
+ for (int j = 0; j < ncols_interleaved; j++)
1760
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1761
+ }
1762
+ }
1763
+ }
1764
+ }
1765
+
1766
+ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1767
+ const int qk = QK8_0;
1768
+ const int nb = n / qk;
1769
+ const int ncols_interleaved = 8;
1770
+ const int blocklen = 8;
1771
+
1772
+ assert (n % qk == 0);
1773
+ assert (nr % 4 == 0);
1774
+ assert (nc % ncols_interleaved == 0);
1775
+
1776
+ UNUSED(s);
1777
+ UNUSED(bs);
1778
+ UNUSED(vx);
1779
+ UNUSED(vy);
1780
+ UNUSED(nr);
1781
+ UNUSED(nc);
1782
+ UNUSED(nb);
1783
+ UNUSED(ncols_interleaved);
1784
+ UNUSED(blocklen);
1785
+
1786
+ float sumf[4][8];
1787
+ int sumi;
1788
+
1789
+ for (int y = 0; y < nr / 4; y++) {
1790
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1791
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1792
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
1793
+ for (int m = 0; m < 4; m++) {
1794
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1795
+ }
1796
+ for (int l = 0; l < nb; l++) {
1797
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1798
+ for (int m = 0; m < 4; m++) {
1799
+ for (int j = 0; j < ncols_interleaved; j++) {
1800
+ sumi = 0;
1801
+ for (int i = 0; i < blocklen; ++i) {
1802
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1803
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1804
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1805
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1806
+ }
1807
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1808
+ }
1809
+ }
1810
+ }
1811
+ }
1812
+ for (int m = 0; m < 4; m++) {
1813
+ for (int j = 0; j < ncols_interleaved; j++)
1814
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1815
+ }
1816
+ }
1817
+ }
1818
+ }
1819
+
1820
+ void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1821
+ const int qk = QK_K;
1822
+ const int nb = n / qk;
1823
+ const int ncols_interleaved = 8;
1824
+ const int blocklen = 4;
1825
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1826
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1827
+ static const uint32_t kmask3 = 0x03030303;
1828
+
1829
+ assert (n % qk == 0);
1830
+ assert (nr % 4 == 0);
1831
+ assert (nc % ncols_interleaved == 0);
1832
+
1833
+ UNUSED(nb);
1834
+ UNUSED(ncols_interleaved);
1835
+ UNUSED(blocklen);
1836
+
1837
+ float sumf[4][8];
1838
+ float sum_minf[4][8];
1839
+ uint32_t utmp[32];
1840
+ int sumi1;
1841
+ int sumi2;
1842
+ int sumi;
1843
+
1844
+ for (int y = 0; y < nr / 4; y++) {
1845
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
1846
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1847
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
1848
+ for (int m = 0; m < 4; m++) {
1849
+ for (int j = 0; j < ncols_interleaved; j++) {
1850
+ sumf[m][j] = 0.0;
1851
+ sum_minf[m][j] = 0.0;
1852
+ }
1853
+ }
1854
+ for (int l = 0; l < nb; l++) {
1855
+ for (int sb = 0; sb < 8; sb++) {
1856
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1857
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1858
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1859
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1860
+ utmp[sb * 4 + 2] = uaux_0;
1861
+ utmp[sb * 4 + 0] &= kmask1;
1862
+ }
1863
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1864
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
1865
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
1866
+ for (int m = 0; m < 4; m++) {
1867
+ for (int j = 0; j < ncols_interleaved; j++) {
1868
+ sumi1 = 0;
1869
+ sumi2 = 0;
1870
+ sumi = 0;
1871
+ for (int i = 0; i < blocklen; ++i) {
1872
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1873
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1874
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
1875
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
1876
+ sumi1 = sumi1 * scales_0[j];
1877
+ sumi2 = sumi2 * scales_1[j];
1878
+ sumi += sumi1 + sumi2;
1879
+ }
1880
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1881
+ }
1882
+ }
1883
+ }
1884
+ for (int sb = 0; sb < 8; sb++) {
1885
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
1886
+ for(int m = 0; m < 4; m++) {
1887
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1888
+ for(int j = 0; j < ncols_interleaved; j++) {
1889
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1890
+ }
1891
+ }
1892
+ }
1893
+ }
1894
+ for (int m = 0; m < 4; m++) {
1895
+ for (int j = 0; j < ncols_interleaved; j++) {
1896
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1897
+ }
1898
+ }
1899
+ }
1900
+ }
1901
+ }
1902
+
1903
+ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1904
+ const int qk = QK_K;
1905
+ const int nb = n / qk;
1906
+ const int ncols_interleaved = 8;
1907
+ const int blocklen = 8;
1908
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1909
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1910
+ static const uint32_t kmask3 = 0x03030303;
1911
+
1912
+ assert (n % qk == 0);
1913
+ assert (nr % 4 == 0);
1914
+ assert (nc % ncols_interleaved == 0);
1915
+
1916
+ UNUSED(bs);
1917
+
1918
+ float sumf[4][8];
1919
+ float sum_minf[4][8];
1920
+ uint32_t utmp[32];
1921
+ int sumi1;
1922
+ int sumi2;
1923
+ int sumi;
1924
+
1925
+ for (int y = 0; y < nr / 4; y++) {
1926
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
1927
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1928
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
1929
+ for (int m = 0; m < 4; m++) {
1930
+ for (int j = 0; j < ncols_interleaved; j++) {
1931
+ sumf[m][j] = 0.0;
1932
+ sum_minf[m][j] = 0.0;
1933
+ }
1934
+ }
1935
+ for (int l = 0; l < nb; l++) {
1936
+ for (int sb = 0; sb < 8; sb++) {
1937
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1938
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1939
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1940
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1941
+ utmp[sb * 4 + 2] = uaux_0;
1942
+ utmp[sb * 4 + 0] &= kmask1;
1943
+ }
1944
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1945
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
1946
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
1947
+ for (int m = 0; m < 4; m++) {
1948
+ for (int j = 0; j < ncols_interleaved; j++) {
1949
+ sumi1 = 0;
1950
+ sumi2 = 0;
1951
+ sumi = 0;
1952
+ for (int i = 0; i < blocklen; ++i) {
1953
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1954
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1955
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1956
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1957
+ sumi1 = sumi1 * scales_0[j];
1958
+ sumi2 = sumi2 * scales_1[j];
1959
+ sumi += sumi1 + sumi2;
1960
+ }
1961
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1962
+ }
1963
+ }
1964
+ }
1965
+ for (int sb = 0; sb < 8; sb++) {
1966
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1967
+ for(int m = 0; m < 4; m++) {
1968
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1969
+ for(int j = 0; j < ncols_interleaved; j++) {
1970
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1971
+ }
1972
+ }
1973
+ }
1974
+ }
1975
+ for (int m = 0; m < 4; m++) {
1976
+ for (int j = 0; j < ncols_interleaved; j++) {
1977
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
688
1978
  }
689
1979
  }
690
1980
  }
691
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
692
1981
  }
693
1982
  }
694
1983
 
695
- void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
696
- float * GGML_RESTRICT s,
697
- size_t bs,
698
- const void * GGML_RESTRICT vx,
699
- const void * GGML_RESTRICT vy,
700
- int nr,
701
- int nc) {
702
- const int qk = QK8_0;
703
- const int nb = n / qk;
704
- const int ncols_interleaved = 4;
705
- const int blocklen = 4;
1984
+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1985
+ const int qk = QK_K;
1986
+ const int nb = n / qk;
1987
+ const int ncols_interleaved = 8;
1988
+ const int blocklen = 8;
706
1989
 
707
- assert(nr == 1);
708
- assert(n % qk == 0);
709
- assert(nc % ncols_interleaved == 0);
1990
+ assert (n % qk == 0);
1991
+ assert (nr % 4 == 0);
1992
+ assert (nc % ncols_interleaved == 0);
710
1993
 
1994
+ UNUSED(s);
711
1995
  UNUSED(bs);
1996
+ UNUSED(vx);
1997
+ UNUSED(vy);
712
1998
  UNUSED(nr);
1999
+ UNUSED(nc);
2000
+ UNUSED(nb);
2001
+ UNUSED(ncols_interleaved);
2002
+ UNUSED(blocklen);
713
2003
 
714
- float sumf[4];
715
- int sumi;
716
-
717
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
718
- for (int x = 0; x < nc / ncols_interleaved; x++) {
719
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2004
+ float sumf[4][8];
2005
+ float sum_minf[4][8];
2006
+ int sumi1, sumi2, sumi3, sumi4;
2007
+ int sumi;
720
2008
 
721
- for (int j = 0; j < ncols_interleaved; j++) {
722
- sumf[j] = 0.0;
723
- }
724
- for (int l = 0; l < nb; l++) {
725
- for (int k = 0; k < (qk / blocklen); k++) {
2009
+ for (int y = 0; y < nr / 4; y++) {
2010
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2011
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2012
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
2013
+ for (int m = 0; m < 4; m++) {
726
2014
  for (int j = 0; j < ncols_interleaved; j++) {
727
- sumi = 0;
728
- for (int i = 0; i < blocklen; ++i) {
729
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
730
- sumi += v0 * a_ptr[l].qs[k * blocklen + i];
2015
+ sumf[m][j] = 0.0;
2016
+ sum_minf[m][j] = 0.0;
2017
+ }
2018
+ }
2019
+ for (int l = 0; l < nb; l++) {
2020
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
2021
+
2022
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
2023
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
2024
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
2025
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
2026
+ for (int m = 0; m < 4; m++) {
2027
+ for (int j = 0; j < ncols_interleaved; j++) {
2028
+ sumi1 = 0;
2029
+ sumi2 = 0;
2030
+ sumi3 = 0;
2031
+ sumi4 = 0;
2032
+ sumi = 0;
2033
+ int offset = ((k / 2) % 2) + j * 2;
2034
+ for (int i = 0; i < blocklen; ++i){
2035
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
2036
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
2037
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
2038
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
2039
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
2040
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
2041
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
2042
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
2043
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
2044
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
2045
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
2046
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
2047
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
2048
+ }
2049
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
2050
+ }
2051
+ }
2052
+ }
2053
+ for(int sb = 0; sb < 8; sb++) {
2054
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
2055
+ for(int m = 0; m < 4; m++) {
2056
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
2057
+ for(int j = 0; j < ncols_interleaved; j++) {
2058
+ int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
2059
+ sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2060
+ }
731
2061
  }
732
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
733
2062
  }
734
2063
  }
735
- }
736
- for (int j = 0; j < ncols_interleaved; j++) {
737
- s[x * ncols_interleaved + j] = sumf[j];
2064
+
2065
+ for (int m = 0; m < 4; m++) {
2066
+ for (int j = 0; j < ncols_interleaved; j++) {
2067
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2068
+ }
2069
+ }
738
2070
  }
739
2071
  }
740
2072
  }
741
2073
 
742
- void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
743
- float * GGML_RESTRICT s,
744
- size_t bs,
745
- const void * GGML_RESTRICT vx,
746
- const void * GGML_RESTRICT vy,
747
- int nr,
748
- int nc) {
749
- const int qk = QK8_0;
750
- const int nb = n / qk;
751
- const int ncols_interleaved = 4;
752
- const int blocklen = 8;
753
-
754
- assert(nr == 1);
755
- assert(n % qk == 0);
756
- assert(nc % ncols_interleaved == 0);
757
-
758
- UNUSED(bs);
759
- UNUSED(nr);
2074
+ void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2075
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
2076
+ }
760
2077
 
761
- float sumf[4];
762
- int sumi;
2078
+ void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2079
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
2080
+ }
763
2081
 
764
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
765
- for (int x = 0; x < nc / ncols_interleaved; x++) {
766
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2082
+ void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2083
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
2084
+ }
767
2085
 
768
- for (int j = 0; j < ncols_interleaved; j++) {
769
- sumf[j] = 0.0;
770
- }
771
- for (int l = 0; l < nb; l++) {
772
- for (int k = 0; k < (qk / blocklen); k++) {
773
- for (int j = 0; j < ncols_interleaved; j++) {
774
- sumi = 0;
775
- for (int i = 0; i < blocklen; ++i) {
776
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
777
- sumi += v0 * a_ptr[l].qs[k * blocklen + i];
778
- }
779
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
780
- }
781
- }
782
- }
783
- for (int j = 0; j < ncols_interleaved; j++) {
784
- s[x * ncols_interleaved + j] = sumf[j];
785
- }
786
- }
2086
+ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2087
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
787
2088
  }
788
2089
 
789
- void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2090
+ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
790
2091
  const int qk = QK8_0;
791
2092
  const int nb = n / qk;
792
2093
  const int ncols_interleaved = 4;
@@ -813,7 +2114,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
813
2114
  for (int y = 0; y < nr / 4; y++) {
814
2115
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
815
2116
  for (int x = 0; x < nc / ncols_interleaved; x++) {
816
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2117
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
817
2118
  for (int m = 0; m < 4; m++) {
818
2119
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
819
2120
  }
@@ -823,10 +2124,10 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
823
2124
  for (int j = 0; j < ncols_interleaved; j++) {
824
2125
  sumi = 0;
825
2126
  for (int i = 0; i < blocklen; ++i) {
826
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
827
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2127
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2128
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
828
2129
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
829
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2130
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
830
2131
  }
831
2132
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
832
2133
  }
@@ -842,33 +2143,23 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
842
2143
  }
843
2144
  }
844
2145
 
845
- void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2146
+ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
846
2147
  const int qk = QK8_0;
847
2148
  const int nb = n / qk;
848
- const int ncols_interleaved = 4;
2149
+ const int ncols_interleaved = 8;
849
2150
  const int blocklen = 8;
850
2151
 
851
- assert (n % qk == 0);
852
- assert (nr % 4 == 0);
853
- assert (nc % ncols_interleaved == 0);
854
-
855
- UNUSED(s);
856
- UNUSED(bs);
857
- UNUSED(vx);
858
- UNUSED(vy);
859
- UNUSED(nr);
860
- UNUSED(nc);
861
- UNUSED(nb);
862
- UNUSED(ncols_interleaved);
863
- UNUSED(blocklen);
2152
+ assert(n % qk == 0);
2153
+ assert(nr % 4 == 0);
2154
+ assert(nc % ncols_interleaved == 0);
864
2155
 
865
- float sumf[4][4];
2156
+ float sumf[4][8];
866
2157
  int sumi;
867
2158
 
868
2159
  for (int y = 0; y < nr / 4; y++) {
869
2160
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
870
2161
  for (int x = 0; x < nc / ncols_interleaved; x++) {
871
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2162
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
872
2163
  for (int m = 0; m < 4; m++) {
873
2164
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
874
2165
  }
@@ -878,10 +2169,10 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
878
2169
  for (int j = 0; j < ncols_interleaved; j++) {
879
2170
  sumi = 0;
880
2171
  for (int i = 0; i < blocklen; ++i) {
881
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
882
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2172
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2173
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
883
2174
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
884
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2175
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
885
2176
  }
886
2177
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
887
2178
  }
@@ -896,25 +2187,59 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
896
2187
  }
897
2188
  }
898
2189
 
899
- void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2190
+ void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
900
2191
  const int qk = QK8_0;
901
2192
  const int nb = n / qk;
902
- const int ncols_interleaved = 8;
903
- const int blocklen = 8;
2193
+ const int ncols_interleaved = 4;
2194
+ const int blocklen = 4;
904
2195
 
905
- assert (n % qk == 0);
906
- assert (nr % 4 == 0);
907
- assert (nc % ncols_interleaved == 0);
2196
+ assert(n % qk == 0);
2197
+ assert(nr % 4 == 0);
2198
+ assert(nc % ncols_interleaved == 0);
908
2199
 
909
- UNUSED(s);
910
- UNUSED(bs);
911
- UNUSED(vx);
912
- UNUSED(vy);
913
- UNUSED(nr);
914
- UNUSED(nc);
915
- UNUSED(nb);
916
- UNUSED(ncols_interleaved);
917
- UNUSED(blocklen);
2200
+ float sumf[4][4];
2201
+ int sumi;
2202
+
2203
+ for (int y = 0; y < nr / 4; y++) {
2204
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2205
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2206
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
2207
+ for (int m = 0; m < 4; m++) {
2208
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2209
+ }
2210
+ for (int l = 0; l < nb; l++) {
2211
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2212
+ for (int m = 0; m < 4; m++) {
2213
+ for (int j = 0; j < ncols_interleaved; j++) {
2214
+ sumi = 0;
2215
+ for (int i = 0; i < blocklen; ++i) {
2216
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2217
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2218
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2219
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2220
+ }
2221
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2222
+ }
2223
+ }
2224
+ }
2225
+ }
2226
+ for (int m = 0; m < 4; m++) {
2227
+ for (int j = 0; j < ncols_interleaved; j++)
2228
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2229
+ }
2230
+ }
2231
+ }
2232
+ }
2233
+
2234
+ void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2235
+ const int qk = QK8_0;
2236
+ const int nb = n / qk;
2237
+ const int ncols_interleaved = 8;
2238
+ const int blocklen = 8;
2239
+
2240
+ assert(n % qk == 0);
2241
+ assert(nr % 4 == 0);
2242
+ assert(nc % ncols_interleaved == 0);
918
2243
 
919
2244
  float sumf[4][8];
920
2245
  int sumi;
@@ -922,7 +2247,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
922
2247
  for (int y = 0; y < nr / 4; y++) {
923
2248
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
924
2249
  for (int x = 0; x < nc / ncols_interleaved; x++) {
925
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
2250
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
926
2251
  for (int m = 0; m < 4; m++) {
927
2252
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
928
2253
  }
@@ -932,12 +2257,12 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
932
2257
  for (int j = 0; j < ncols_interleaved; j++) {
933
2258
  sumi = 0;
934
2259
  for (int i = 0; i < blocklen; ++i) {
935
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
936
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2260
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2261
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
937
2262
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
938
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2263
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
939
2264
  }
940
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2265
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
941
2266
  }
942
2267
  }
943
2268
  }
@@ -950,183 +2275,118 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
950
2275
  }
951
2276
  }
952
2277
 
953
- void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
954
- const int qk = QK_K;
955
- const int nb = n / qk;
956
- const int ncols_interleaved = 8;
957
- const int blocklen = 4;
958
- static const uint32_t kmask1 = 0x3f3f3f3f;
959
- static const uint32_t kmask2 = 0x0f0f0f0f;
960
- static const uint32_t kmask3 = 0x03030303;
961
-
962
- assert (n % qk == 0);
963
- assert (nr % 4 == 0);
964
- assert (nc % ncols_interleaved == 0);
2278
+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
2279
+ float * GGML_RESTRICT s,
2280
+ size_t bs,
2281
+ const void * GGML_RESTRICT vx,
2282
+ const void * GGML_RESTRICT vy,
2283
+ int nr,
2284
+ int nc) {
2285
+ const int qk = QK8_0;
2286
+ const int nb = n / qk;
2287
+ const int ncols_interleaved = 4;
2288
+ const int blocklen = 4;
965
2289
 
966
- UNUSED(nb);
967
- UNUSED(ncols_interleaved);
968
- UNUSED(blocklen);
2290
+ assert(n % qk == 0);
2291
+ assert(nr % 4 == 0);
2292
+ assert(nc % ncols_interleaved == 0);
969
2293
 
970
- float sumf[4][8];
971
- float sum_minf[4][8];
972
- uint32_t utmp[32];
973
- int sumi1;
974
- int sumi2;
975
- int sumi;
2294
+ float sumf[4][4];
2295
+ int sumi;
976
2296
 
977
2297
  for (int y = 0; y < nr / 4; y++) {
978
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2298
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
979
2299
  for (int x = 0; x < nc / ncols_interleaved; x++) {
980
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
2300
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
981
2301
  for (int m = 0; m < 4; m++) {
982
2302
  for (int j = 0; j < ncols_interleaved; j++) {
983
2303
  sumf[m][j] = 0.0;
984
- sum_minf[m][j] = 0.0;
985
2304
  }
986
2305
  }
987
2306
  for (int l = 0; l < nb; l++) {
988
- for (int sb = 0; sb < 8; sb++) {
989
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
990
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
991
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
992
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
993
- utmp[sb * 4 + 2] = uaux_0;
994
- utmp[sb * 4 + 0] &= kmask1;
995
- }
996
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
997
- uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
998
- uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
2307
+ for (int k = 0; k < (qk / blocklen); k++) {
999
2308
  for (int m = 0; m < 4; m++) {
1000
2309
  for (int j = 0; j < ncols_interleaved; j++) {
1001
- sumi1 = 0;
1002
- sumi2 = 0;
1003
2310
  sumi = 0;
1004
2311
  for (int i = 0; i < blocklen; ++i) {
1005
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1006
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1007
- sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
1008
- sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
1009
- sumi1 = sumi1 * scales_0[j];
1010
- sumi2 = sumi2 * scales_1[j];
1011
- sumi += sumi1 + sumi2;
2312
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2313
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1012
2314
  }
1013
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1014
- }
1015
- }
1016
- }
1017
- for (int sb = 0; sb < 8; sb++) {
1018
- uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
1019
- for(int m = 0; m < 4; m++) {
1020
- const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1021
- for(int j = 0; j < ncols_interleaved; j++) {
1022
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2315
+ sumf[m][j] +=
2316
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1023
2317
  }
1024
2318
  }
1025
2319
  }
1026
2320
  }
1027
2321
  for (int m = 0; m < 4; m++) {
1028
2322
  for (int j = 0; j < ncols_interleaved; j++) {
1029
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2323
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1030
2324
  }
1031
2325
  }
1032
2326
  }
1033
2327
  }
1034
2328
  }
1035
2329
 
1036
- void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1037
- const int qk = QK_K;
1038
- const int nb = n / qk;
1039
- const int ncols_interleaved = 8;
1040
- const int blocklen = 8;
1041
- static const uint32_t kmask1 = 0x3f3f3f3f;
1042
- static const uint32_t kmask2 = 0x0f0f0f0f;
1043
- static const uint32_t kmask3 = 0x03030303;
1044
2330
 
1045
- assert (n % qk == 0);
1046
- assert (nr % 4 == 0);
1047
- assert (nc % ncols_interleaved == 0);
1048
2331
 
1049
- UNUSED(s);
1050
- UNUSED(bs);
1051
- UNUSED(vx);
1052
- UNUSED(vy);
1053
- UNUSED(nr);
1054
- UNUSED(nc);
1055
- UNUSED(nb);
1056
- UNUSED(ncols_interleaved);
1057
- UNUSED(blocklen);
2332
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
2333
+ float * GGML_RESTRICT s,
2334
+ size_t bs,
2335
+ const void * GGML_RESTRICT vx,
2336
+ const void * GGML_RESTRICT vy,
2337
+ int nr,
2338
+ int nc) {
2339
+ const int qk = QK8_0;
2340
+ const int nb = n / qk;
2341
+ const int ncols_interleaved = 4;
2342
+ const int blocklen = 8;
1058
2343
 
1059
- float sumf[4][8];
1060
- float sum_minf[4][8];
1061
- uint32_t utmp[32];
1062
- int sumi1;
1063
- int sumi2;
1064
- int sumi;
2344
+ assert(n % qk == 0);
2345
+ assert(nr % 4 == 0);
2346
+ assert(nc % ncols_interleaved == 0);
2347
+
2348
+ float sumf[4][4];
2349
+ int sumi;
1065
2350
 
1066
2351
  for (int y = 0; y < nr / 4; y++) {
1067
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2352
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1068
2353
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1069
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
2354
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1070
2355
  for (int m = 0; m < 4; m++) {
1071
2356
  for (int j = 0; j < ncols_interleaved; j++) {
1072
2357
  sumf[m][j] = 0.0;
1073
- sum_minf[m][j] = 0.0;
1074
2358
  }
1075
2359
  }
1076
2360
  for (int l = 0; l < nb; l++) {
1077
- for (int sb = 0; sb < 8; sb++) {
1078
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1079
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1080
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1081
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1082
- utmp[sb * 4 + 2] = uaux_0;
1083
- utmp[sb * 4 + 0] &= kmask1;
1084
- }
1085
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1086
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
1087
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
2361
+ for (int k = 0; k < (qk / blocklen); k++) {
1088
2362
  for (int m = 0; m < 4; m++) {
1089
2363
  for (int j = 0; j < ncols_interleaved; j++) {
1090
- sumi1 = 0;
1091
- sumi2 = 0;
1092
2364
  sumi = 0;
1093
2365
  for (int i = 0; i < blocklen; ++i) {
1094
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1095
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1096
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1097
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1098
- sumi1 = sumi1 * scales_0[j];
1099
- sumi2 = sumi2 * scales_1[j];
1100
- sumi += sumi1 + sumi2;
2366
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2367
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1101
2368
  }
1102
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1103
- }
1104
- }
1105
- }
1106
- for (int sb = 0; sb < 8; sb++) {
1107
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1108
- for(int m = 0; m < 4; m++) {
1109
- const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1110
- for(int j = 0; j < ncols_interleaved; j++) {
1111
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2369
+ sumf[m][j] +=
2370
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1112
2371
  }
1113
2372
  }
1114
2373
  }
1115
2374
  }
1116
2375
  for (int m = 0; m < 4; m++) {
1117
2376
  for (int j = 0; j < ncols_interleaved; j++) {
1118
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2377
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1119
2378
  }
1120
2379
  }
1121
2380
  }
1122
2381
  }
1123
2382
  }
1124
2383
 
1125
- void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1126
- const int qk = QK_K;
2384
+ #if defined __riscv_zvfh
2385
+ void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2386
+ const int qk = QK8_0;
1127
2387
  const int nb = n / qk;
1128
- const int ncols_interleaved = 8;
1129
- const int blocklen = 8;
2388
+ const int ncols_interleaved = 16;
2389
+ const int blocklen = 1;
1130
2390
 
1131
2391
  assert (n % qk == 0);
1132
2392
  assert (nr % 4 == 0);
@@ -1142,82 +2402,45 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1142
2402
  UNUSED(ncols_interleaved);
1143
2403
  UNUSED(blocklen);
1144
2404
 
1145
- float sumf[4][8];
1146
- float sum_minf[4][8];
1147
- int sumi1, sumi2, sumi3, sumi4;
2405
+ float sumf[4][16];
1148
2406
  int sumi;
1149
2407
 
1150
2408
  for (int y = 0; y < nr / 4; y++) {
1151
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2409
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1152
2410
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1153
- const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
2411
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
1154
2412
  for (int m = 0; m < 4; m++) {
1155
- for (int j = 0; j < ncols_interleaved; j++) {
1156
- sumf[m][j] = 0.0;
1157
- sum_minf[m][j] = 0.0;
1158
- }
2413
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1159
2414
  }
1160
2415
  for (int l = 0; l < nb; l++) {
1161
- for (int k = 0; k < (qk / (4 * blocklen)); k++) {
1162
-
1163
- const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
1164
- const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
1165
- const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
1166
- const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
2416
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1167
2417
  for (int m = 0; m < 4; m++) {
1168
2418
  for (int j = 0; j < ncols_interleaved; j++) {
1169
- sumi1 = 0;
1170
- sumi2 = 0;
1171
- sumi3 = 0;
1172
- sumi4 = 0;
1173
2419
  sumi = 0;
1174
- int offset = ((k / 2) % 2) + j * 2;
1175
- for (int i = 0; i < blocklen; ++i){
1176
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
1177
- const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
1178
- const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
1179
- const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
1180
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1181
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1182
- sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
1183
- sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
1184
- sumi1 = sumi1 * (scales_0[offset] & 0xF);
1185
- sumi2 = sumi2 * (scales_1[offset] & 0xF);
1186
- sumi3 = sumi3 * (scales_2[offset] & 0xF);
1187
- sumi4 = sumi4 * (scales_3[offset] & 0xF);
1188
- sumi += sumi1 + sumi2 + sumi3 + sumi4;
2420
+ for (int i = 0; i < blocklen; ++i) {
2421
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
2422
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2423
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2424
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1189
2425
  }
1190
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1191
- }
1192
- }
1193
- }
1194
- for(int sb = 0; sb < 8; sb++) {
1195
- const uint8_t *mins = b_ptr[l].scales + sb * 16;
1196
- for(int m = 0; m < 4; m++) {
1197
- const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1198
- for(int j = 0; j < ncols_interleaved; j++) {
1199
- int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
1200
- sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2426
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1201
2427
  }
1202
2428
  }
1203
2429
  }
1204
2430
  }
1205
-
1206
2431
  for (int m = 0; m < 4; m++) {
1207
- for (int j = 0; j < ncols_interleaved; j++) {
1208
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1209
- }
2432
+ for (int j = 0; j < ncols_interleaved; j++)
2433
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1210
2434
  }
1211
2435
  }
1212
2436
  }
1213
2437
  }
1214
2438
 
1215
-
1216
- void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1217
- const int qk = QK8_0;
2439
+ void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2440
+ const int qk = QK_K;
1218
2441
  const int nb = n / qk;
1219
- const int ncols_interleaved = 4;
1220
- const int blocklen = 4;
2442
+ const int ncols_interleaved = 16;
2443
+ const int blocklen = 1;
1221
2444
 
1222
2445
  assert (n % qk == 0);
1223
2446
  assert (nr % 4 == 0);
@@ -1233,59 +2456,97 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1233
2456
  UNUSED(ncols_interleaved);
1234
2457
  UNUSED(blocklen);
1235
2458
 
1236
- {
1237
- float sumf[4][4];
1238
- int sumi;
2459
+ float sumf[4][16];
2460
+ float sum_minf[4][16];
2461
+ uint8_t scales[128];
2462
+ uint8_t mins[128];
2463
+ int sumi1;
2464
+ int sumi2;
2465
+ int sumi;
2466
+
2467
+ for (int y = 0; y < nr / 4; y++) {
2468
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2469
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2470
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
2471
+ for (int m = 0; m < 4; m++) {
2472
+ for (int j = 0; j < ncols_interleaved; j++) {
2473
+ sumf[m][j] = 0.0;
2474
+ sum_minf[m][j] = 0.0;
2475
+ }
2476
+ }
2477
+ for (int l = 0; l < nb; l++) {
2478
+ for (int i = 0; i < 128; i++) {
2479
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
2480
+ mins[i] = b_ptr[l].scales[i] >> 4;
2481
+ }
2482
+ for (int i = 0; i < 64; i++) {
2483
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
2484
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
2485
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
2486
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
2487
+ }
1239
2488
 
1240
- for (int y = 0; y < nr / 4; y++) {
1241
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1242
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1243
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1244
- for (int m = 0; m < 4; m++) {
1245
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2489
+ for (int sb = 0; sb < 8; sb++) {
2490
+ uint8_t *min = &mins[sb * 16];
2491
+ for(int m = 0; m < 4; m++) {
2492
+ const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
2493
+ for(int j = 0; j < ncols_interleaved; j++) {
2494
+ sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2495
+ }
2496
+ }
1246
2497
  }
1247
- for (int l = 0; l < nb; l++) {
1248
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2498
+
2499
+ for (int sb = 0; sb < 8; sb += 2) {
2500
+ uint8_t *scales_0 = &scales[sb * 16];
2501
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
2502
+
2503
+ for (int i = 0; i < QK4_0; i++) {
1249
2504
  for (int m = 0; m < 4; m++) {
1250
2505
  for (int j = 0; j < ncols_interleaved; j++) {
2506
+ sumi1 = 0;
2507
+ sumi2 = 0;
1251
2508
  sumi = 0;
1252
- for (int i = 0; i < blocklen; ++i) {
1253
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1254
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1255
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1256
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
1257
- }
1258
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2509
+
2510
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
2511
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
2512
+ sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
2513
+ sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
2514
+ sumi1 = sumi1 * scales_0[j];
2515
+ sumi2 = sumi2 * scales_1[j];
2516
+ sumi += sumi1 + sumi2;
2517
+
2518
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1259
2519
  }
1260
2520
  }
1261
2521
  }
1262
2522
  }
1263
- for (int m = 0; m < 4; m++) {
1264
- for (int j = 0; j < ncols_interleaved; j++)
1265
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2523
+ }
2524
+ for (int m = 0; m < 4; m++) {
2525
+ for (int j = 0; j < ncols_interleaved; j++) {
2526
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1266
2527
  }
1267
2528
  }
1268
2529
  }
1269
2530
  }
1270
2531
  }
1271
2532
 
1272
- void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2533
+ void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1273
2534
  const int qk = QK8_0;
1274
2535
  const int nb = n / qk;
1275
- const int ncols_interleaved = 8;
1276
- const int blocklen = 8;
2536
+ const int ncols_interleaved = 16;
2537
+ const int blocklen = 1;
1277
2538
 
1278
2539
  assert(n % qk == 0);
1279
2540
  assert(nr % 4 == 0);
1280
2541
  assert(nc % ncols_interleaved == 0);
1281
2542
 
1282
- float sumf[4][8];
2543
+ float sumf[4][16];
1283
2544
  int sumi;
1284
2545
 
1285
2546
  for (int y = 0; y < nr / 4; y++) {
1286
2547
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1287
2548
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1288
- const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
2549
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
1289
2550
  for (int m = 0; m < 4; m++) {
1290
2551
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1291
2552
  }
@@ -1298,7 +2559,7 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1298
2559
  const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1299
2560
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1300
2561
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1301
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2562
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
1302
2563
  }
1303
2564
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1304
2565
  }
@@ -1313,29 +2574,23 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1313
2574
  }
1314
2575
  }
1315
2576
 
1316
- void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
1317
- float * GGML_RESTRICT s,
1318
- size_t bs,
1319
- const void * GGML_RESTRICT vx,
1320
- const void * GGML_RESTRICT vy,
1321
- int nr,
1322
- int nc) {
2577
+ void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1323
2578
  const int qk = QK8_0;
1324
2579
  const int nb = n / qk;
1325
- const int ncols_interleaved = 4;
1326
- const int blocklen = 4;
2580
+ const int ncols_interleaved = 16;
2581
+ const int blocklen = 1;
1327
2582
 
1328
2583
  assert(n % qk == 0);
1329
2584
  assert(nr % 4 == 0);
1330
2585
  assert(nc % ncols_interleaved == 0);
1331
2586
 
1332
- float sumf[4][4];
2587
+ float sumf[4][16];
1333
2588
  int sumi;
1334
2589
 
1335
2590
  for (int y = 0; y < nr / 4; y++) {
1336
2591
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1337
2592
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1338
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2593
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
1339
2594
  for (int m = 0; m < 4; m++) {
1340
2595
  for (int j = 0; j < ncols_interleaved; j++) {
1341
2596
  sumf[m][j] = 0.0;
@@ -1365,57 +2620,102 @@ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
1365
2620
  }
1366
2621
  }
1367
2622
 
1368
- void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
1369
- float * GGML_RESTRICT s,
1370
- size_t bs,
1371
- const void * GGML_RESTRICT vx,
1372
- const void * GGML_RESTRICT vy,
1373
- int nr,
1374
- int nc) {
1375
- const int qk = QK8_0;
1376
- const int nb = n / qk;
1377
- const int ncols_interleaved = 4;
1378
- const int blocklen = 8;
1379
2623
 
1380
- assert(n % qk == 0);
2624
+ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2625
+ assert(n % QK_K == 0);
1381
2626
  assert(nr % 4 == 0);
1382
- assert(nc % ncols_interleaved == 0);
2627
+ assert(nc % 16 == 0);
2628
+ const int nb = n / QK_K;
2629
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
2630
+ const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
2631
+
2632
+ const int sb_perm[16] = {
2633
+ 0, 4, 1, 5, 2, 6, 3, 7,
2634
+ 8, 12, 9, 13, 10, 14, 11, 15
2635
+ };
1383
2636
 
1384
- float sumf[4][4];
1385
- int sumi;
2637
+ // Iterate Rows in tiles of 4
2638
+ for (int row_tile = 0; row_tile < nr; row_tile += 4) {
2639
+ // Iterate Columns in tiles of 16
2640
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
2641
+
2642
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
2643
+ const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
2644
+
2645
+ float sumf[4][16];
2646
+ memset(sumf, 0, sizeof(sumf));
2647
+
2648
+ for (int k_block = 0; k_block < nb; ++k_block) {
2649
+ int32_t isum[4][16];
2650
+ int32_t summs[4][16];
2651
+ memset(isum, 0, sizeof(isum));
2652
+ memset(summs, 0, sizeof(summs));
2653
+
2654
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
2655
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
2656
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
2657
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
2658
+
2659
+ for (int sb = 0; sb < 16; ++sb) {
2660
+ int scale_offset = sb_perm[sb] * 16;
2661
+
2662
+ int byte_base;
2663
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
2664
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
2665
+ int shift = ((sb / 2) % 4) * 2;
2666
+
2667
+ for (int col = 0; col < 16; ++col) {
2668
+ uint8_t sc_val = sc_rhs[scale_offset + col];
2669
+ int32_t d_sb = sc_val & 0xF;
2670
+ int32_t m_sb = sc_val >> 4;
2671
+
2672
+ // Correction Term
2673
+ for (int r = 0; r < 4; ++r) {
2674
+ int bsum_idx = (sb / 4) * 16 + r * 4 + (sb % 4);
2675
+ summs[r][col] += bs_lhs[bsum_idx] * m_sb;
2676
+ }
1386
2677
 
1387
- for (int y = 0; y < nr / 4; y++) {
1388
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1389
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1390
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1391
- for (int m = 0; m < 4; m++) {
1392
- for (int j = 0; j < ncols_interleaved; j++) {
1393
- sumf[m][j] = 0.0;
1394
- }
1395
- }
1396
- for (int l = 0; l < nb; l++) {
1397
- for (int k = 0; k < (qk / blocklen); k++) {
1398
- for (int m = 0; m < 4; m++) {
1399
- for (int j = 0; j < ncols_interleaved; j++) {
1400
- sumi = 0;
1401
- for (int i = 0; i < blocklen; ++i) {
1402
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1403
- sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
2678
+ // Main Dot Product
2679
+ for (int l = 0; l < 16; ++l) {
2680
+ int qs_idx = (byte_base + l) * 16 + col;
2681
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
2682
+
2683
+ // Calculate Q8 index for this specific k and row
2684
+ int k = sb * 16 + l;
2685
+ int q8_idx = (k / 4) * 16 + (k % 4);
2686
+
2687
+ for (int r = 0; r < 4; ++r) {
2688
+ // Add r*4 to jump to the correct row within the 4x4 chunk
2689
+ int8_t q8_val = qs_lhs[q8_idx + r * 4];
2690
+ isum[r][col] += q8_val * q2_val * d_sb;
1404
2691
  }
1405
- sumf[m][j] +=
1406
- sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1407
2692
  }
1408
2693
  }
1409
2694
  }
2695
+
2696
+ // Finalize K-Block
2697
+ for (int col = 0; col < 16; ++col) {
2698
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
2699
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
2700
+
2701
+ for (int r = 0; r < 4; ++r) {
2702
+ float d_lhs = y_ptr[k_block].d[r];
2703
+ float d_all = d_lhs * d_rhs;
2704
+ float d_min = d_lhs * dm_rhs;
2705
+ sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
2706
+ }
2707
+ }
1410
2708
  }
1411
- for (int m = 0; m < 4; m++) {
1412
- for (int j = 0; j < ncols_interleaved; j++) {
1413
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2709
+
2710
+ for (int r = 0; r < 4; ++r) {
2711
+ for (int col = 0; col < 16; ++col) {
2712
+ s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
1414
2713
  }
1415
2714
  }
1416
2715
  }
1417
2716
  }
1418
2717
  }
2718
+ #endif
1419
2719
 
1420
2720
  } // extern "C"
1421
2721
 
@@ -1498,16 +2798,212 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
1498
2798
 
1499
2799
  uint64_t elems;
1500
2800
  memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1501
- elems ^= xor_mask;
2801
+ elems ^= xor_mask;
2802
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
2803
+ }
2804
+
2805
+ return out;
2806
+ }
2807
+
2808
+ static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
2809
+ block_q4_0x16 out;
2810
+
2811
+ for (int i = 0; i < 16; i++) {
2812
+ out.d[i] = in[i].d;
2813
+ }
2814
+
2815
+ const int end = QK4_0 * 8 / blck_size_interleave;
2816
+
2817
+ if (blck_size_interleave == 1) {
2818
+ const uint8_t xor_mask = 0x88;
2819
+ for (int i = 0; i < end; ++i) {
2820
+ int src_id = i % 16;
2821
+ int src_offset = i / 16;
2822
+ int dst_offset = i;
2823
+
2824
+ out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
2825
+ }
2826
+ } else {
2827
+ GGML_ASSERT(false);
2828
+ }
2829
+
2830
+ return out;
2831
+ }
2832
+
2833
+ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
2834
+ block_q4_Kx8 out;
2835
+ //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
2836
+ for (int i = 0; i < 8; i++) {
2837
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2838
+ }
2839
+
2840
+ for (int i = 0; i < 8; i++) {
2841
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2842
+ }
2843
+
2844
+ const int end = QK_K * 4 / blck_size_interleave;
2845
+
2846
+ // Interleave Q4_K quants by taking 8 bytes at a time
2847
+ for (int i = 0; i < end; ++i) {
2848
+ int src_id = i % 8;
2849
+ int src_offset = (i / 8) * blck_size_interleave;
2850
+ int dst_offset = i * blck_size_interleave;
2851
+
2852
+ // buffer large enough for the max interleave block size (8 bytes)
2853
+ uint64_t elems;
2854
+ memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
2855
+ memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
2856
+ }
2857
+
2858
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
2859
+ // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
2860
+ // The output Q4_Kx8 structure has 96 bytes
2861
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
2862
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
2863
+ uint8_t s[8], m[8];
2864
+
2865
+ for (int i = 0; i < 4; i++) {
2866
+ for (int j = 0; j < 8; j++) {
2867
+ s[j] = in[j].scales[i] & 63;
2868
+ m[j] = in[j].scales[i + 4] & 63;
2869
+ }
2870
+
2871
+ out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
2872
+ out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
2873
+ out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
2874
+ out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
2875
+ out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
2876
+ out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
2877
+ out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
2878
+ out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
2879
+ out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
2880
+ out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
2881
+ out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
2882
+ out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
2883
+
2884
+ }
2885
+
2886
+ for (int i = 0; i < 4; i++) {
2887
+ for (int j = 0; j < 8; j++) {
2888
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2889
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2890
+ }
2891
+
2892
+ out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
2893
+ out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
2894
+ out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
2895
+ out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
2896
+ out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
2897
+ out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
2898
+ out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
2899
+ out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
2900
+ out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
2901
+ out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
2902
+ out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
2903
+ out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
2904
+
2905
+ }
2906
+
2907
+ return out;
2908
+ }
2909
+
2910
+ static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
2911
+ block_q4_Kx16 out;
2912
+ //Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
2913
+ for (int i = 0; i < 16; i++) {
2914
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2915
+ }
2916
+
2917
+ for (int i = 0; i < 16; i++) {
2918
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2919
+ }
2920
+
2921
+ const int end = QK_K * 8 / blck_size_interleave;
2922
+
2923
+ if (blck_size_interleave == 1) {
2924
+ for (int i = 0; i < end; ++i) {
2925
+ int src_id = i % 16;
2926
+ int src_offset = i / 16;
2927
+ int dst_offset = i;
2928
+
2929
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
2930
+ }
2931
+
2932
+ // RVV repacking.
2933
+ //
2934
+ // Extract sums and mins for all 8 sub-blocks for each block of Q4_K.
2935
+ uint8_t s[128], m[128];
2936
+ for (int i = 0; i < 4; i++) {
2937
+ for (int j = 0; j < 16; j++) {
2938
+ s[i * 16 + j] = in[j].scales[i] & 63;
2939
+ m[i * 16 + j] = in[j].scales[i + 4] & 63;
2940
+ }
2941
+ }
2942
+ for (int i = 0; i < 4; i++) {
2943
+ for (int j = 0; j < 16; j++) {
2944
+ s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2945
+ m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2946
+ }
2947
+ }
2948
+
2949
+ for (int i = 0; i < 128; i++) {
2950
+ out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
2951
+ }
2952
+ for (int i = 0; i < 64; i++) {
2953
+ out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
2954
+ }
2955
+ } else {
2956
+ GGML_ASSERT(false);
2957
+ }
2958
+
2959
+ return out;
2960
+ }
2961
+
2962
+ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
2963
+ block_q2_Kx8 out;
2964
+
2965
+ // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
2966
+ for (int i = 0; i < 8; i++) {
2967
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2968
+ }
2969
+
2970
+ for (int i = 0; i < 8; i++) {
2971
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2972
+ }
2973
+
2974
+ const int end = QK_K * 2 / blck_size_interleave;
2975
+
2976
+ // Interleave Q2_K quants by taking 8 bytes at a time
2977
+ for (int i = 0; i < end; ++i) {
2978
+ int src_id = i % 8;
2979
+ int src_offset = (i / 8) * blck_size_interleave;
2980
+ int dst_offset = i * blck_size_interleave;
2981
+
2982
+ uint64_t elems;
2983
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1502
2984
  memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1503
2985
  }
1504
2986
 
2987
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
2988
+ // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
2989
+ // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
2990
+ // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
2991
+ // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
2992
+
2993
+ for (int i = 0; i < 128; i++) {
2994
+ // Index for selecting which q2k super block
2995
+ int src1 = (i % 16) / 2;
2996
+ // Index for selecting scale
2997
+ int src2 = ((i / 16) * 2) + (i % 2);
2998
+
2999
+ out.scales[i] = in[src1].scales[src2];
3000
+ }
1505
3001
  return out;
1506
3002
  }
1507
3003
 
1508
- static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
1509
- block_q4_Kx8 out;
1510
- //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
3004
+ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
3005
+ block_q5_Kx8 out;
3006
+ //Delta(scale) and dmin values of the eight Q5_K structures are copied onto the output interleaved structure
1511
3007
  for (int i = 0; i < 8; i++) {
1512
3008
  out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1513
3009
  }
@@ -1518,22 +3014,33 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1518
3014
 
1519
3015
  const int end = QK_K * 4 / blck_size_interleave;
1520
3016
 
1521
- // Interleave Q4_K quants by taking 8 bytes at a time
3017
+ // Interleave Q5_K quants by taking blck_size_interleave bytes at a time
1522
3018
  for (int i = 0; i < end; ++i) {
1523
- int src_id = i % 8;
3019
+ int src_id = i % 8;
1524
3020
  int src_offset = (i / 8) * blck_size_interleave;
1525
3021
  int dst_offset = i * blck_size_interleave;
1526
3022
 
1527
- uint64_t elems;
1528
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1529
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
3023
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
1530
3024
  }
1531
3025
 
1532
- // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
1533
- // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
1534
- // The output Q4_Kx8 structure has 96 bytes
1535
- // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
1536
- // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
3026
+ // Repeat for high bits with the same chunk size, since
3027
+ // the high bits are interleaved in Q5_K and the index is
3028
+ // qh_idx = (qs_idx % 32);
3029
+ // qh_val = qh[qh_idx] >> (qs_idx / 32);
3030
+ for (int i = 0; i < end / 4; ++i) {
3031
+ int src_id = i % 8;
3032
+ int src_offset = (i / 8) * blck_size_interleave;
3033
+ int dst_offset = i * blck_size_interleave;
3034
+
3035
+ memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
3036
+ }
3037
+
3038
+ // The below logic is copied over from Q4_K
3039
+ // The point is to unpack all the scales and mins for each sub block every time we load 12 bytes.
3040
+ // Currently the Q5_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
3041
+ // The output Q5_Kx8 structure has 96 bytes
3042
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q5_K structure
3043
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q5_K structures
1537
3044
  uint8_t s[8], m[8];
1538
3045
 
1539
3046
  for (int i = 0; i < 4; i++) {
@@ -1554,13 +3061,12 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1554
3061
  out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
1555
3062
  out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
1556
3063
  out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
1557
-
1558
3064
  }
1559
3065
 
1560
3066
  for (int i = 0; i < 4; i++) {
1561
3067
  for (int j = 0; j < 8; j++) {
1562
- s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
1563
- m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
3068
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
3069
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
1564
3070
  }
1565
3071
 
1566
3072
  out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
@@ -1575,54 +3081,117 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1575
3081
  out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
1576
3082
  out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
1577
3083
  out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
1578
-
1579
3084
  }
1580
3085
 
1581
3086
  return out;
1582
3087
  }
1583
3088
 
1584
- static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
1585
- block_q2_Kx8 out;
3089
+ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
3090
+ block_q6_Kx8 out;
3091
+ constexpr int n_blocks = 8; // Kx8
3092
+ for (int i = 0; i < n_blocks; i++) {
3093
+ out.d[i] = in[i].d;
3094
+ }
1586
3095
 
1587
- // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
1588
- for (int i = 0; i < 8; i++) {
1589
- out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
3096
+ const int end_ls = QK_K * 4 / blck_size_interleave;
3097
+ // Interleave Q6_K quants by taking blck_size_interleave bytes at a time
3098
+ for (int i = 0; i < end_ls; ++i) {
3099
+ int src_id = i % n_blocks;
3100
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3101
+ int dst_offset = i * blck_size_interleave;
3102
+
3103
+ uint64_t elem_ls;
3104
+ memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
3105
+ memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
1590
3106
  }
1591
3107
 
1592
- for (int i = 0; i < 8; i++) {
3108
+ // Interleave high bits using same chunk size as low bits
3109
+ const int end_hs = end_ls / 2;
3110
+ for (int i = 0; i < end_hs; ++i) {
3111
+ int src_id = i % n_blocks;
3112
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3113
+ int dst_offset = i * blck_size_interleave;
3114
+
3115
+ uint64_t elem_hs;
3116
+ memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
3117
+ memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
3118
+ }
3119
+
3120
+ // The below logic is designed so as to unpack and rearrange scales in Q6_K
3121
+ // The output Q6_Kx8 structure interleaves the 8 bit scales in the same fashion as the quants
3122
+ // Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
3123
+ // scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
3124
+ constexpr int n_scales = QK_K / 16;
3125
+
3126
+ for (int i = 0; i < n_blocks; i++) {
3127
+ for (int j = 0; j < n_scales; j++) {
3128
+ out.scales[j * n_blocks + i] = in[i].scales[j];
3129
+ }
3130
+ }
3131
+
3132
+ return out;
3133
+ }
3134
+
3135
+ static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
3136
+ block_q2_Kx16 out;
3137
+ constexpr int N_COLS = 16;
3138
+
3139
+ // 1. Copy Super-Scales (d) and Super-Mins (dmin)
3140
+ for (int i = 0; i < N_COLS; i++) {
3141
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1593
3142
  out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
1594
3143
  }
1595
3144
 
1596
- const int end = QK_K * 2 / blck_size_interleave;
3145
+ // 2. Interleave Q2_K Data
3146
+ const int bytes_per_col = 64;
3147
+ const int total_bytes = N_COLS * bytes_per_col;
3148
+ const int end = total_bytes / blck_size_interleave;
1597
3149
 
1598
- // Interleave Q2_K quants by taking 8 bytes at a time
1599
3150
  for (int i = 0; i < end; ++i) {
1600
- int src_id = i % 8;
1601
- int src_offset = (i / 8) * blck_size_interleave;
3151
+ int src_col_id = i % N_COLS;
3152
+ int src_offset = (i / N_COLS) * blck_size_interleave;
1602
3153
  int dst_offset = i * blck_size_interleave;
1603
-
1604
- uint64_t elems;
1605
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1606
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
3154
+ memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
1607
3155
  }
1608
3156
 
1609
- // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
1610
- // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
1611
- // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
1612
- // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
1613
- // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
3157
+ // 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
3158
+ int out_idx = 0;
1614
3159
 
1615
- for(int i = 0; i < 128; i++){
3160
+ // Arrays define the sub-block order for each group
3161
+ const int even_low_sbs[] = {0, 2, 4, 6};
3162
+ const int odd_low_sbs[] = {1, 3, 5, 7};
3163
+ const int even_high_sbs[] = {8, 10, 12, 14};
3164
+ const int odd_high_sbs[] = {9, 11, 13, 15};
1616
3165
 
1617
- // Index for selecting which q2k super block
1618
- int src1 = (i % 16) / 2;
1619
- // Index for selecting scale
1620
- int src2 = ((i / 16) * 2) + (i % 2);
3166
+ // Pack Group 1: Even-Low
3167
+ for (int sb : even_low_sbs) {
3168
+ for (int col = 0; col < N_COLS; col++) {
3169
+ out.scales[out_idx++] = in[col].scales[sb];
3170
+ }
3171
+ }
1621
3172
 
1622
- out.scales[i] = in[src1].scales[src2];
3173
+ // Pack Group 2: Odd-Low
3174
+ for (int sb : odd_low_sbs) {
3175
+ for (int col = 0; col < N_COLS; col++) {
3176
+ out.scales[out_idx++] = in[col].scales[sb];
3177
+ }
3178
+ }
3179
+
3180
+ // Pack Group 3: Even-High
3181
+ for (int sb : even_high_sbs) {
3182
+ for (int col = 0; col < N_COLS; col++) {
3183
+ out.scales[out_idx++] = in[col].scales[sb];
3184
+ }
1623
3185
  }
1624
- return out;
1625
3186
 
3187
+ // Pack Group 4: Odd-High
3188
+ for (int sb : odd_high_sbs) {
3189
+ for (int col = 0; col < N_COLS; col++) {
3190
+ out.scales[out_idx++] = in[col].scales[sb];
3191
+ }
3192
+ }
3193
+
3194
+ return out;
1626
3195
  }
1627
3196
 
1628
3197
  static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
@@ -1687,6 +3256,36 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
1687
3256
  GGML_UNUSED(data_size);
1688
3257
  }
1689
3258
 
3259
+ static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3260
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
3261
+ constexpr int nrows_interleaved = 16;
3262
+
3263
+ block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
3264
+ const block_q4_K * src = (const block_q4_K*) data;
3265
+ block_q4_K dst_tmp[16];
3266
+ int nrow = ggml_nrows(t);
3267
+ int nblocks = t->ne[0] / QK_K;
3268
+
3269
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
3270
+
3271
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3272
+ return -1;
3273
+ }
3274
+
3275
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3276
+ for (int64_t x = 0; x < nblocks; x++) {
3277
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3278
+ dst_tmp[i] = src[x + i * nblocks];
3279
+ }
3280
+ *dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
3281
+ }
3282
+ src += nrows_interleaved * nblocks;
3283
+ }
3284
+ return 0;
3285
+
3286
+ GGML_UNUSED(data_size);
3287
+ }
3288
+
1690
3289
  static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1691
3290
  GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
1692
3291
  GGML_ASSERT(interleave_block == 8);
@@ -1706,7 +3305,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
1706
3305
 
1707
3306
  for (int b = 0; b < nrow; b += nrows_interleaved) {
1708
3307
  for (int64_t x = 0; x < nblocks; x++) {
1709
- for (int i = 0; i < nrows_interleaved; i++ ) {
3308
+ for (int i = 0; i < nrows_interleaved; i++) {
1710
3309
  dst_tmp[i] = src[x + i * nblocks];
1711
3310
  }
1712
3311
  *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
@@ -1718,6 +3317,132 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
1718
3317
  GGML_UNUSED(data_size);
1719
3318
  }
1720
3319
 
3320
+ static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3321
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
3322
+ constexpr int nrows_interleaved = 16;
3323
+
3324
+ block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
3325
+ const block_q2_K * src = (const block_q2_K*) data;
3326
+
3327
+ block_q2_K dst_tmp[nrows_interleaved];
3328
+
3329
+ int nrow = ggml_nrows(t);
3330
+ int nblocks = t->ne[0] / QK_K;
3331
+
3332
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
3333
+
3334
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3335
+ return -1;
3336
+ }
3337
+
3338
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3339
+ for (int64_t x = 0; x < nblocks; x++) {
3340
+ // This loop gathers 16 separate blocks (one from each column)
3341
+ // that correspond to the same K-dimension chunk.
3342
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3343
+ dst_tmp[i] = src[x + i * nblocks];
3344
+ }
3345
+
3346
+ *dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
3347
+ }
3348
+ src += nrows_interleaved * nblocks;
3349
+ }
3350
+ return 0;
3351
+
3352
+ GGML_UNUSED(data_size);
3353
+ }
3354
+
3355
+ static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3356
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3357
+ constexpr int nrows_interleaved = 16;
3358
+
3359
+ block_q4_0x16 * dst = (block_q4_0x16*)t->data;
3360
+ const block_q4_0 * src = (const block_q4_0*) data;
3361
+ block_q4_0 dst_tmp[16];
3362
+ int nrow = ggml_nrows(t);
3363
+ int nblocks = t->ne[0] / QK4_0;
3364
+
3365
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
3366
+
3367
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3368
+ return -1;
3369
+ }
3370
+
3371
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3372
+ for (int64_t x = 0; x < nblocks; x++) {
3373
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3374
+ dst_tmp[i] = src[x + i * nblocks];
3375
+ }
3376
+ *dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
3377
+ }
3378
+ src += nrows_interleaved * nblocks;
3379
+ }
3380
+ return 0;
3381
+
3382
+ GGML_UNUSED(data_size);
3383
+ }
3384
+
3385
+ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
3386
+ int interleave_block,
3387
+ const void * GGML_RESTRICT data,
3388
+ size_t data_size) {
3389
+ GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
3390
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3391
+ constexpr int nrows_interleaved = 8;
3392
+
3393
+ block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
3394
+ const block_q5_K * src = (const block_q5_K *) data;
3395
+ block_q5_K dst_tmp[8];
3396
+ int nrow = ggml_nrows(t);
3397
+ int nblocks = t->ne[0] / QK_K;
3398
+
3399
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
3400
+
3401
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3402
+ return -1;
3403
+ }
3404
+
3405
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3406
+ for (int64_t x = 0; x < nblocks; x++) {
3407
+ for (int i = 0; i < nrows_interleaved; i++) {
3408
+ dst_tmp[i] = src[x + i * nblocks];
3409
+ }
3410
+ *dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
3411
+ }
3412
+ src += nrows_interleaved * nblocks;
3413
+ }
3414
+ return 0;
3415
+ }
3416
+
3417
+ static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3418
+ GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
3419
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3420
+ constexpr int nrows_interleaved = 8;
3421
+
3422
+ block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
3423
+ const block_q6_K * src = (const block_q6_K *) data;
3424
+ block_q6_K dst_tmp[8];
3425
+ int nrow = ggml_nrows(t);
3426
+ int nblocks = t->ne[0] / QK_K;
3427
+
3428
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
3429
+
3430
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3431
+ return -1;
3432
+ }
3433
+
3434
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3435
+ for (int64_t x = 0; x < nblocks; x++) {
3436
+ for (int i = 0; i < nrows_interleaved; i++) {
3437
+ dst_tmp[i] = src[x + i * nblocks];
3438
+ }
3439
+ *dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
3440
+ }
3441
+ src += nrows_interleaved * nblocks;
3442
+ }
3443
+ return 0;
3444
+ }
3445
+
1721
3446
  static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1722
3447
  GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
1723
3448
  GGML_ASSERT(interleave_block == 8);
@@ -1757,9 +3482,63 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
1757
3482
  GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1758
3483
  constexpr int nrows_interleaved = 4;
1759
3484
 
1760
- block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
3485
+ block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
3486
+ const block_q8_0 * src = (const block_q8_0 *) data;
3487
+ block_q8_0 dst_tmp[4];
3488
+ int nrow = ggml_nrows(t);
3489
+ int nblocks = t->ne[0] / QK8_0;
3490
+
3491
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
3492
+
3493
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3494
+ return -1;
3495
+ }
3496
+
3497
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3498
+ for (int64_t x = 0; x < nblocks; x++) {
3499
+ for (int i = 0; i < nrows_interleaved; i++) {
3500
+ dst_tmp[i] = src[x + i * nblocks];
3501
+ }
3502
+ *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
3503
+ }
3504
+ src += nrows_interleaved * nblocks;
3505
+ }
3506
+ return 0;
3507
+ }
3508
+
3509
+ static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
3510
+ block_q8_0x16 out;
3511
+
3512
+ for (int i = 0; i < 16; i++) {
3513
+ out.d[i] = in[i].d;
3514
+ }
3515
+
3516
+ const int end = QK8_0 * 16 / blck_size_interleave;
3517
+
3518
+ if (blck_size_interleave == 1) {
3519
+ for (int i = 0; i < end; ++i) {
3520
+ int src_id = i % 16;
3521
+ int src_offset = i / 16;
3522
+ int dst_offset = i;
3523
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3524
+ }
3525
+ } else {
3526
+ GGML_ASSERT(false);
3527
+ }
3528
+
3529
+ return out;
3530
+ }
3531
+
3532
+ static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
3533
+ int interleave_block,
3534
+ const void * GGML_RESTRICT data,
3535
+ size_t data_size) {
3536
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
3537
+ constexpr int nrows_interleaved = 16;
3538
+
3539
+ block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
1761
3540
  const block_q8_0 * src = (const block_q8_0 *) data;
1762
- block_q8_0 dst_tmp[4];
3541
+ block_q8_0 dst_tmp[16];
1763
3542
  int nrow = ggml_nrows(t);
1764
3543
  int nblocks = t->ne[0] / QK8_0;
1765
3544
 
@@ -1774,7 +3553,7 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
1774
3553
  for (int i = 0; i < nrows_interleaved; i++) {
1775
3554
  dst_tmp[i] = src[x + i * nblocks];
1776
3555
  }
1777
- *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
3556
+ *dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
1778
3557
  }
1779
3558
  src += nrows_interleaved * nblocks;
1780
3559
  }
@@ -1906,6 +3685,177 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
1906
3685
  GGML_UNUSED(data_size);
1907
3686
  }
1908
3687
 
3688
+ static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
3689
+ block_iq4_nlx16 out;
3690
+
3691
+ for (int i = 0; i < 16; i++) {
3692
+ out.d[i] = in[i].d;
3693
+ }
3694
+
3695
+ const int end = QK4_NL * 8 / blck_size_interleave;
3696
+
3697
+ if (blck_size_interleave == 1) {
3698
+ for (int i = 0; i < end; ++i) {
3699
+ int src_id = i % 16;
3700
+ int src_offset = i / 16;
3701
+ int dst_offset = i;
3702
+
3703
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3704
+ }
3705
+ } else {
3706
+ GGML_ASSERT(false);
3707
+ }
3708
+
3709
+ return out;
3710
+ }
3711
+
3712
+ static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3713
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
3714
+ GGML_ASSERT(interleave_block == 1);
3715
+
3716
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
3717
+ block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
3718
+
3719
+ block_iq4_nl dst_tmp[16];
3720
+
3721
+ int nrow = ggml_nrows(t);
3722
+ int nrows_interleaved = 16;
3723
+ int nblocks = t->ne[0] / QK4_NL;
3724
+
3725
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
3726
+
3727
+ if (t->ne[1] % nrows_interleaved != 0) {
3728
+ return -1;
3729
+ }
3730
+
3731
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3732
+ for (int64_t x = 0; x < nblocks; x++) {
3733
+ for (int i = 0; i < nrows_interleaved; i++) {
3734
+ dst_tmp[i] = src[x + i * nblocks];
3735
+ }
3736
+ *dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
3737
+ }
3738
+ src += nrows_interleaved * nblocks;
3739
+ }
3740
+ return 0;
3741
+
3742
+ GGML_UNUSED(data_size);
3743
+ }
3744
+
3745
+ static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
3746
+ block_mxfp4x4 out;
3747
+
3748
+ for (int i = 0; i < 4; i++) {
3749
+ out.e[i] = in[i].e;
3750
+ }
3751
+
3752
+ const int end = QK_MXFP4 * 2 / blck_size_interleave;
3753
+
3754
+ if (blck_size_interleave == 4) {
3755
+ for (int i = 0; i < end; ++i) {
3756
+ int src_id = i % 4;
3757
+ int src_offset = (i / 4) * blck_size_interleave;
3758
+ int dst_offset = i * blck_size_interleave;
3759
+
3760
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
3761
+ }
3762
+ } else {
3763
+ GGML_ASSERT(false);
3764
+ }
3765
+
3766
+ return out;
3767
+ }
3768
+
3769
+ static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3770
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3771
+ GGML_ASSERT(interleave_block == 4);
3772
+
3773
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3774
+ block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
3775
+
3776
+ block_mxfp4 dst_tmp[4];
3777
+
3778
+ int nrow = ggml_nrows(t);
3779
+ int nrows_interleaved = 4;
3780
+ int nblocks = t->ne[0] / QK_MXFP4;
3781
+
3782
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3783
+
3784
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3785
+ return -1;
3786
+ }
3787
+
3788
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3789
+ for (int64_t x = 0; x < nblocks; x++) {
3790
+ for (int i = 0; i < nrows_interleaved; i++) {
3791
+ dst_tmp[i] = src[x + i * nblocks];
3792
+ }
3793
+ *dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
3794
+ }
3795
+ src += nrows_interleaved * nblocks;
3796
+ }
3797
+ return 0;
3798
+
3799
+ GGML_UNUSED(data_size);
3800
+ }
3801
+
3802
+ static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
3803
+ block_mxfp4x8 out;
3804
+
3805
+ for (int i = 0; i < 8; i++) {
3806
+ out.e[i] = in[i].e;
3807
+ }
3808
+
3809
+ const int end = QK_MXFP4 * 4 / blck_size_interleave;
3810
+
3811
+ if (blck_size_interleave == 8) {
3812
+ for (int i = 0; i < end; ++i) {
3813
+ int src_id = i % 8;
3814
+ int src_offset = (i / 8) * blck_size_interleave;
3815
+ int dst_offset = i * blck_size_interleave;
3816
+
3817
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
3818
+ }
3819
+ } else {
3820
+ GGML_ASSERT(false);
3821
+ }
3822
+
3823
+ return out;
3824
+ }
3825
+
3826
+ static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3827
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3828
+ GGML_ASSERT(interleave_block == 8);
3829
+
3830
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3831
+ block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
3832
+
3833
+ block_mxfp4 dst_tmp[8];
3834
+
3835
+ int nrow = ggml_nrows(t);
3836
+ int nrows_interleaved = 8;
3837
+ int nblocks = t->ne[0] / QK_MXFP4;
3838
+
3839
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3840
+
3841
+ if (t->ne[1] % nrows_interleaved != 0) {
3842
+ return -1;
3843
+ }
3844
+
3845
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3846
+ for (int64_t x = 0; x < nblocks; x++) {
3847
+ for (int i = 0; i < nrows_interleaved; i++) {
3848
+ dst_tmp[i] = src[x + i * nblocks];
3849
+ }
3850
+ *dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
3851
+ }
3852
+ src += nrows_interleaved * nblocks;
3853
+ }
3854
+ return 0;
3855
+
3856
+ GGML_UNUSED(data_size);
3857
+ }
3858
+
1909
3859
  namespace ggml::cpu::repack {
1910
3860
  // repack
1911
3861
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1936,6 +3886,22 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
1936
3886
  return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
1937
3887
  }
1938
3888
 
3889
+ template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3890
+ return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
3891
+ }
3892
+
3893
+ template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3894
+ return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
3895
+ }
3896
+
3897
+ template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3898
+ return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
3899
+ }
3900
+
3901
+ template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3902
+ return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
3903
+ }
3904
+
1939
3905
  template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1940
3906
  return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
1941
3907
  }
@@ -1949,6 +3915,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
1949
3915
  return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
1950
3916
  }
1951
3917
 
3918
+ template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3919
+ return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
3920
+ }
3921
+
3922
+ template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3923
+ return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
3924
+ }
3925
+
1952
3926
  template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1953
3927
  return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
1954
3928
  }
@@ -1957,6 +3931,28 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
1957
3931
  return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
1958
3932
  }
1959
3933
 
3934
+ #if defined __riscv_zvfh
3935
+ template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3936
+ return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
3937
+ }
3938
+
3939
+ template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3940
+ return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
3941
+ }
3942
+
3943
+ template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3944
+ return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
3945
+ }
3946
+
3947
+ template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3948
+ return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
3949
+ }
3950
+
3951
+ template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3952
+ return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
3953
+ }
3954
+ #endif
3955
+
1960
3956
  // gemv
1961
3957
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1962
3958
  void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1973,6 +3969,17 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
1973
3969
  ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1974
3970
  }
1975
3971
 
3972
+ template <>
3973
+ void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
3974
+ float * s,
3975
+ size_t bs,
3976
+ const void * vx,
3977
+ const void * vy,
3978
+ int nr,
3979
+ int nc) {
3980
+ ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3981
+ }
3982
+
1976
3983
  template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1977
3984
  ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
1978
3985
  }
@@ -1981,8 +3988,20 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
1981
3988
  ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1982
3989
  }
1983
3990
 
1984
- template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1985
- ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3991
+ template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3992
+ ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
3993
+ }
3994
+
3995
+ template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3996
+ ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3997
+ }
3998
+
3999
+ template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4000
+ ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4001
+ }
4002
+
4003
+ template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4004
+ ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1986
4005
  }
1987
4006
 
1988
4007
  template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
@@ -1993,6 +4012,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
1993
4012
  ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1994
4013
  }
1995
4014
 
4015
+ template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4016
+ ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4017
+ }
4018
+
4019
+ template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4020
+ ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4021
+ }
4022
+
1996
4023
  template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1997
4024
  ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1998
4025
  }
@@ -2001,6 +4028,28 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2001
4028
  ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2002
4029
  }
2003
4030
 
4031
+ #if defined __riscv_zvfh
4032
+ template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4033
+ ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4034
+ }
4035
+
4036
+ template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4037
+ ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4038
+ }
4039
+
4040
+ template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4041
+ ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4042
+ }
4043
+
4044
+ template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4045
+ ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4046
+ }
4047
+
4048
+ template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4049
+ ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4050
+ }
4051
+ #endif
4052
+
2004
4053
  // gemm
2005
4054
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
2006
4055
  void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -2013,20 +4062,43 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2013
4062
  ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2014
4063
  }
2015
4064
 
2016
- template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2017
- ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4065
+ template <>
4066
+ void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
4067
+ float * s,
4068
+ size_t bs,
4069
+ const void * vx,
4070
+ const void * vy,
4071
+ int nr,
4072
+ int nc) {
4073
+ ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2018
4074
  }
2019
4075
 
2020
- template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2021
- ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4076
+ template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4077
+ ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4078
+ }
4079
+
4080
+ template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4081
+ ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
2022
4082
  }
2023
4083
 
2024
4084
  template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2025
4085
  ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2026
4086
  }
2027
4087
 
2028
- template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2029
- ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4088
+ template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4089
+ ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4090
+ }
4091
+
4092
+ template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4093
+ ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4094
+ }
4095
+
4096
+ template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4097
+ ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4098
+ }
4099
+
4100
+ template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4101
+ ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2030
4102
  }
2031
4103
 
2032
4104
  template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
@@ -2037,6 +4109,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
2037
4109
  ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2038
4110
  }
2039
4111
 
4112
+ template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4113
+ ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4114
+ }
4115
+
4116
+ template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4117
+ ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4118
+ }
4119
+
2040
4120
  template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2041
4121
  ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
2042
4122
  }
@@ -2045,6 +4125,28 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2045
4125
  ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2046
4126
  }
2047
4127
 
4128
+ #if defined __riscv_zvfh
4129
+ template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4130
+ ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4131
+ }
4132
+
4133
+ template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4134
+ ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4135
+ }
4136
+
4137
+ template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4138
+ ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4139
+ }
4140
+
4141
+ template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4142
+ ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4143
+ }
4144
+
4145
+ template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4146
+ ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4147
+ }
4148
+ #endif
4149
+
2048
4150
  class tensor_traits_base : public ggml::cpu::tensor_traits {
2049
4151
  public:
2050
4152
  virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2063,7 +4165,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2063
4165
  case GGML_OP_MUL_MAT_ID:
2064
4166
  {
2065
4167
  size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
2066
- size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
4168
+ size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
2067
4169
 
2068
4170
  const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
2069
4171
  const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
@@ -2328,7 +4430,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2328
4430
  auto * wdata = (char *)params->wdata;
2329
4431
  auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
2330
4432
 
2331
- // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
4433
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
2332
4434
  auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
2333
4435
  struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
2334
4436
 
@@ -2393,20 +4495,19 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2393
4495
  for (int ir1 = 0; ir1 < nr1; ir1++) {
2394
4496
  struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
2395
4497
 
2396
- const int id = row_mapping.i1; // selected expert index
4498
+ const int id = row_mapping.i1; // selected expert index
2397
4499
 
2398
4500
  const int64_t i11 = id % ne11;
2399
- const int64_t i12 = row_mapping.i2; // row index in src1
4501
+ const int64_t i12 = row_mapping.i2; // row index in src1
2400
4502
 
2401
- const int64_t i1 = id; // selected expert index
2402
- const int64_t i2 = i12; // row
4503
+ const int64_t i1 = id; // selected expert index
4504
+ const int64_t i2 = i12; // row
2403
4505
 
2404
4506
  const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
2405
4507
 
2406
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
2407
- (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
2408
- src0_cur + src0_cur_start * nb01,
2409
- src1_col, 1, src0_cur_end - src0_cur_start);
4508
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
4509
+ ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
4510
+ src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
2410
4511
  }
2411
4512
  }
2412
4513
  #undef MMID_MATRIX_ROW
@@ -2422,7 +4523,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2422
4523
  } // namespace ggml::cpu::repack
2423
4524
 
2424
4525
  static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
2425
-
2426
4526
  // instance for Q4
2427
4527
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
2428
4528
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
@@ -2432,6 +4532,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2432
4532
  static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
2433
4533
  static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
2434
4534
 
4535
+ // instance for Q5_K
4536
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
4537
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
4538
+
4539
+ // instance for Q6_K
4540
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
4541
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
4542
+
2435
4543
  // instance for Q2
2436
4544
  static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
2437
4545
 
@@ -2439,13 +4547,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2439
4547
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
2440
4548
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
2441
4549
 
4550
+ // instance for MXFP4
4551
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
4552
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
4553
+
2442
4554
  // instance for Q8_0
2443
4555
  static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
2444
4556
  static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
2445
4557
 
4558
+ // instances for RISC-V
4559
+ //
4560
+ // These implement outer-product style matrix multiplication kernels with
4561
+ // an interleave of 1.
4562
+ #if defined __riscv_zvfh
4563
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
4564
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
4565
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
4566
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
4567
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
4568
+ #endif
4569
+
2446
4570
  if (cur->type == GGML_TYPE_Q4_0) {
2447
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
2448
- || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
4571
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
2449
4572
  if (cur->ne[1] % 8 == 0) {
2450
4573
  return &q4_0_8x8_q8_0;
2451
4574
  }
@@ -2460,6 +4583,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2460
4583
  return &q4_0_4x4_q8_0;
2461
4584
  }
2462
4585
  }
4586
+ if (ggml_cpu_has_riscv_v()) {
4587
+ #if defined __riscv_zvfh
4588
+ switch (__riscv_vlenb() * 8) {
4589
+ case 128: { break; } // TODO
4590
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
4591
+ case 512: { break; } // TODO
4592
+ case 1024: { break; } // TODO
4593
+ default: { return nullptr; }
4594
+ }
4595
+ #endif
4596
+ }
2463
4597
  } else if (cur->type == GGML_TYPE_Q4_K) {
2464
4598
  if (ggml_cpu_has_avx2()) {
2465
4599
  if (cur->ne[1] % 8 == 0) {
@@ -2476,12 +4610,56 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2476
4610
  return &q4_K_8x4_q8_K;
2477
4611
  }
2478
4612
  }
4613
+ if (ggml_cpu_has_riscv_v()) {
4614
+ #if defined __riscv_zvfh
4615
+ switch (__riscv_vlenb() * 8) {
4616
+ case 128: { break; } // TODO
4617
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
4618
+ case 512: { break; } // TODO
4619
+ case 1024: { break; } // TODO
4620
+ default: { return nullptr; }
4621
+ }
4622
+ #endif
4623
+ }
2479
4624
  } else if (cur->type == GGML_TYPE_Q2_K) {
2480
4625
  if (ggml_cpu_has_avx512()) {
2481
4626
  if (cur->ne[1] % 8 == 0) {
2482
4627
  return &q2_K_8x8_q8_K;
2483
4628
  }
2484
4629
  }
4630
+ if (ggml_cpu_has_riscv_v()) {
4631
+ #if defined __riscv_zvfh
4632
+ switch (__riscv_vlenb() * 8) {
4633
+ case 128: { break; } // TODO
4634
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
4635
+ case 512: { break; } // TODO
4636
+ case 1024: { break; } // TODO
4637
+ default: { return nullptr; }
4638
+ }
4639
+ #endif
4640
+ }
4641
+ } else if (cur->type == GGML_TYPE_Q5_K) {
4642
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4643
+ if (cur->ne[1] % 8 == 0) {
4644
+ return &q5_K_8x8_q8_K;
4645
+ }
4646
+ }
4647
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4648
+ if (cur->ne[1] % 8 == 0) {
4649
+ return &q5_K_8x4_q8_K;
4650
+ }
4651
+ }
4652
+ } else if (cur->type == GGML_TYPE_Q6_K) {
4653
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4654
+ if (cur->ne[1] % 8 == 0) {
4655
+ return &q6_K_8x8_q8_K;
4656
+ }
4657
+ }
4658
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4659
+ if (cur->ne[1] % 8 == 0) {
4660
+ return &q6_K_8x4_q8_K;
4661
+ }
4662
+ }
2485
4663
  } else if (cur->type == GGML_TYPE_IQ4_NL) {
2486
4664
  if (ggml_cpu_has_avx2()) {
2487
4665
  if (cur->ne[1] % 8 == 0) {
@@ -2493,6 +4671,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2493
4671
  return &iq4_nl_4x4_q8_0;
2494
4672
  }
2495
4673
  }
4674
+ if (ggml_cpu_has_riscv_v()) {
4675
+ #if defined __riscv_zvfh
4676
+ switch (__riscv_vlenb() * 8) {
4677
+ case 128: { break; } // TODO
4678
+ case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
4679
+ case 512: { break; } // TODO
4680
+ case 1024: { break; } // TODO
4681
+ default: { return nullptr; }
4682
+ }
4683
+ #endif
4684
+ }
4685
+ } else if (cur->type == GGML_TYPE_MXFP4) {
4686
+ if (ggml_cpu_has_avx2()) {
4687
+ if (cur->ne[1] % 8 == 0) {
4688
+ return &mxfp4_8x8_q8_0;
4689
+ }
4690
+ }
4691
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4692
+ if (cur->ne[1] % 4 == 0) {
4693
+ return &mxfp4_4x4_q8_0;
4694
+ }
4695
+ }
2496
4696
  } else if (cur->type == GGML_TYPE_Q8_0) {
2497
4697
  if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2498
4698
  if (cur->ne[1] % 4 == 0) {
@@ -2504,6 +4704,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2504
4704
  return &q8_0_4x4_q8_0;
2505
4705
  }
2506
4706
  }
4707
+ if (ggml_cpu_has_riscv_v()) {
4708
+ #if defined __riscv_zvfh
4709
+ switch (__riscv_vlenb() * 8) {
4710
+ case 128: { break; } // TODO
4711
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
4712
+ case 512: { break; } // TODO
4713
+ case 1024: { break; } // TODO
4714
+ default: { return nullptr; }
4715
+ }
4716
+ #endif
4717
+ }
2507
4718
  }
2508
4719
 
2509
4720
  return nullptr;