whispercpp 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1017)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -48,6 +48,90 @@ static inline int nearest_int(float fval) {
48
48
 
49
49
  extern "C" {
50
50
 
51
+ #if defined __riscv_zvfh
52
+ void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
53
+ assert(QK8_0 == 32);
54
+ assert(k % QK8_0 == 0);
55
+ const int nb = k / QK8_0;
56
+
57
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
58
+
59
+ // scalar
60
+ const int blck_size_interleave = 1;
61
+ float srcv[4][QK8_0];
62
+ float id[4];
63
+
64
+ for (int i = 0; i < nb; i++) {
65
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
66
+ float amax = 0.0f; // absolute max
67
+
68
+ for (int j = 0; j < QK8_0; j++) {
69
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
70
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
71
+ }
72
+
73
+ const float d = amax / ((1 << 7) - 1);
74
+ id[row_iter] = d ? 1.0f / d : 0.0f;
75
+
76
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
77
+ }
78
+
79
+ for (int j = 0; j < QK8_0 * 4; j++) {
80
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
81
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
82
+ src_offset += (j % blck_size_interleave);
83
+
84
+ float x0 = srcv[src_id][src_offset] * id[src_id];
85
+ y[i].qs[j] = roundf(x0);
86
+ }
87
+ }
88
+ }
89
+
90
+ void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
91
+ assert(QK_K == 256);
92
+ assert(k % QK_K == 0);
93
+ const int nb = k / QK_K;
94
+
95
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
96
+
97
+ const int blck_size_interleave = 1;
98
+ float srcv[4][QK_K];
99
+ float iscale[4];
100
+
101
+ for (int i = 0; i < nb; i++) {
102
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
103
+ float amax = 0.0f; // absolute max
104
+ float max = 0;
105
+
106
+ for (int j = 0; j < QK_K; j++) {
107
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
108
+ // Update the maximum value of the corresponding super block
109
+ if(amax < fabsf(srcv[row_iter][j])) {
110
+ amax = fabsf(srcv[row_iter][j]);
111
+ max = srcv[row_iter][j];
112
+ }
113
+ }
114
+
115
+ iscale[row_iter] = amax ? -127.f/max : 0;
116
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
117
+ }
118
+
119
+ for (int j = 0; j < QK_K / 4; j++) {
120
+ y[i].bsums[j] = 0;
121
+ }
122
+ for (int j = 0; j < QK_K * 4; j++) {
123
+ int src_id = j % 4;
124
+ int src_offset = j / 4;
125
+ int index = ((j >> 6) << 2) + (j & 3);
126
+
127
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
128
+ y[i].qs[j] = nearest_int(x0);
129
+ y[i].bsums[index] += y[i].qs[j];
130
+ }
131
+ }
132
+ }
133
+ #endif
134
+
51
135
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
52
136
  assert(QK8_0 == 32);
53
137
  assert(k % QK8_0 == 0);
@@ -124,7 +208,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
124
208
  }
125
209
  }
126
210
 
127
-
128
211
  void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
129
212
  assert(QK_K == 256);
130
213
  assert(k % QK_K == 0);
@@ -256,192 +339,289 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
256
339
  ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
257
340
  }
258
341
 
259
- extern "C" {
342
+ #if defined __riscv_zvfh
343
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
344
+ assert(nrow == 4);
345
+ UNUSED(nrow);
346
+ ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
347
+ }
260
348
 
261
- void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
262
- const int qk = QK8_0;
263
- const int nb = n / qk;
264
- const int ncols_interleaved = 4;
265
- const int blocklen = 4;
349
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
350
+ assert(nrow == 4);
351
+ UNUSED(nrow);
352
+ ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
353
+ }
354
+ #endif
355
+
356
+ template <int M, int N>
357
+ static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
358
+ float * GGML_RESTRICT s,
359
+ size_t bs,
360
+ const void * GGML_RESTRICT vx,
361
+ const void * GGML_RESTRICT vy,
362
+ int nr,
363
+ int nc) {
364
+ constexpr int blocklen = M;
365
+ constexpr int ncols_interleaved = N;
366
+ const int qk = QK_K;
367
+ const int nb = n / qk;
368
+ const int blocks_per_half = 64 / blocklen;
266
369
 
267
- assert(nr == 1);
268
370
  assert(n % qk == 0);
269
371
  assert(nc % ncols_interleaved == 0);
270
372
 
271
- UNUSED(s);
272
373
  UNUSED(bs);
273
- UNUSED(vx);
274
- UNUSED(vy);
275
374
  UNUSED(nr);
276
- UNUSED(nc);
277
- UNUSED(nb);
278
- UNUSED(ncols_interleaved);
279
- UNUSED(blocklen);
280
375
 
281
- float sumf[4];
282
- int sumi;
376
+ float sumf[8];
283
377
 
284
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
378
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
285
379
  for (int x = 0; x < nc / ncols_interleaved; x++) {
286
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
380
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
381
+
382
+ for (int j = 0; j < ncols_interleaved; j++) {
383
+ sumf[j] = 0.0f;
384
+ }
287
385
 
288
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
289
386
  for (int l = 0; l < nb; l++) {
290
387
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
388
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
389
+ const int base_h = base_l + 64;
390
+
391
+ const int scale_idx_l = base_l / 16;
392
+ const int scale_idx_h = base_h / 16;
393
+
394
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
395
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
396
+
397
+ const int qh_half_l = (base_l / 128) * 32;
398
+ const int qh_half_h = (base_h / 128) * 32;
399
+
291
400
  for (int j = 0; j < ncols_interleaved; j++) {
292
- sumi = 0;
293
- for (int i = 0; i < blocklen; ++i) {
294
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
295
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
296
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
401
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
402
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
403
+
404
+ int sumi_l = 0;
405
+ int sumi_h = 0;
406
+
407
+ for (int i = 0; i < blocklen; i++) {
408
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
409
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
410
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
411
+
412
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
413
+ const int qh_chunk_l = qh_idx_l / blocklen;
414
+ const int qh_pos_l = qh_idx_l % blocklen;
415
+ const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
416
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
417
+
418
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
419
+ const int qh_chunk_h = qh_idx_h / blocklen;
420
+ const int qh_pos_h = qh_idx_h % blocklen;
421
+ const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
422
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
423
+
424
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
425
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
426
+
427
+ const int8_t a_l = a_ptr[l].qs[base_l + i];
428
+ const int8_t a_h = a_ptr[l].qs[base_h + i];
429
+
430
+ sumi_l += q_l * a_l;
431
+ sumi_h += q_h * a_h;
297
432
  }
298
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
433
+
434
+ sumf[j] +=
435
+ (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
299
436
  }
300
437
  }
301
438
  }
302
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
439
+
440
+ for (int j = 0; j < ncols_interleaved; j++) {
441
+ s[x * ncols_interleaved + j] = sumf[j];
442
+ }
303
443
  }
304
444
  }
305
445
 
306
- void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
307
- const int qk = QK8_0;
308
- const int nb = n / qk;
309
- const int ncols_interleaved = 4;
310
- const int blocklen = 8;
446
+ template <int M, int N>
447
+ static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
448
+ float * GGML_RESTRICT s,
449
+ size_t bs,
450
+ const void * GGML_RESTRICT vx,
451
+ const void * GGML_RESTRICT vy,
452
+ int nr,
453
+ int nc) {
454
+ constexpr int blocklen = M;
455
+ constexpr int ncols_interleaved = N;
456
+ const int qk = QK_K;
457
+ const int nb = n / qk;
458
+ const int blocks_per_half = 64 / blocklen;
459
+ const int q8_half_stride = 512;
460
+ const int q8_low_high_step = 256;
311
461
 
312
- assert (n % qk == 0);
313
- assert (nc % ncols_interleaved == 0);
462
+ assert(n % qk == 0);
463
+ assert(nr % 4 == 0);
464
+ assert(nc % ncols_interleaved == 0);
314
465
 
315
- UNUSED(s);
316
466
  UNUSED(bs);
317
- UNUSED(vx);
318
- UNUSED(vy);
319
- UNUSED(nr);
320
- UNUSED(nc);
321
- UNUSED(nb);
322
- UNUSED(ncols_interleaved);
323
- UNUSED(blocklen);
324
467
 
325
- float sumf[4];
326
- int sumi;
468
+ float sumf[4][8];
327
469
 
328
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
329
- for (int x = 0; x < nc / ncols_interleaved; x++) {
330
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
470
+ for (int y = 0; y < nr / 4; y++) {
471
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
472
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
473
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
331
474
 
332
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
333
- for (int l = 0; l < nb; l++) {
334
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
475
+ for (int m = 0; m < 4; m++) {
335
476
  for (int j = 0; j < ncols_interleaved; j++) {
336
- sumi = 0;
337
- for (int i = 0; i < blocklen; ++i) {
338
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
339
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
340
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
341
- }
342
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
477
+ sumf[m][j] = 0.0f;
343
478
  }
344
479
  }
345
- }
346
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
347
- }
348
- }
349
480
 
350
- void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
351
- const int qk = QK8_0;
352
- const int nb = n / qk;
353
- const int ncols_interleaved = 8;
354
- const int blocklen = 8;
481
+ for (int l = 0; l < nb; l++) {
482
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
483
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
484
+ const int base_h = base_l + 64;
355
485
 
356
- assert (n % qk == 0);
357
- assert (nc % ncols_interleaved == 0);
486
+ const int scale_idx_l = base_l / 16;
487
+ const int scale_idx_h = base_h / 16;
358
488
 
359
- UNUSED(s);
360
- UNUSED(bs);
361
- UNUSED(vx);
362
- UNUSED(vy);
363
- UNUSED(nr);
364
- UNUSED(nc);
365
- UNUSED(nb);
366
- UNUSED(ncols_interleaved);
367
- UNUSED(blocklen);
489
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
490
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
368
491
 
369
- float sumf[8];
370
- int sumi;
492
+ const int qh_half_l = (base_l / 128) * 32;
493
+ const int qh_half_h = (base_h / 128) * 32;
371
494
 
372
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
373
- for (int x = 0; x < nc / ncols_interleaved; x++) {
374
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
495
+ const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
375
496
 
376
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
377
- for (int l = 0; l < nb; l++) {
378
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
379
- for (int j = 0; j < ncols_interleaved; j++) {
380
- sumi = 0;
381
- for (int i = 0; i < blocklen; ++i) {
382
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
383
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
384
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
497
+ for (int m = 0; m < 4; m++) {
498
+ for (int j = 0; j < ncols_interleaved; j++) {
499
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
500
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
501
+
502
+ int sumi_l = 0;
503
+ int sumi_h = 0;
504
+
505
+ for (int i = 0; i < blocklen; i++) {
506
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
507
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
508
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
509
+
510
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
511
+ const int qh_chunk_l = qh_idx_l / blocklen;
512
+ const int qh_pos_l = qh_idx_l % blocklen;
513
+ const int qh_offset_l =
514
+ qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
515
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
516
+
517
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
518
+ const int qh_chunk_h = qh_idx_h / blocklen;
519
+ const int qh_pos_h = qh_idx_h % blocklen;
520
+ const int qh_offset_h =
521
+ qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
522
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
523
+
524
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
525
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
526
+
527
+ const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
528
+ const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
529
+
530
+ sumi_l += q_l * q8_l;
531
+ sumi_h += q_h * q8_h;
532
+ }
533
+
534
+ sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
535
+ a_ptr[l].d[m];
536
+ }
385
537
  }
386
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
538
+ }
539
+ }
540
+
541
+ for (int m = 0; m < 4; m++) {
542
+ for (int j = 0; j < ncols_interleaved; j++) {
543
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
387
544
  }
388
545
  }
389
546
  }
390
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
391
547
  }
392
548
  }
393
549
 
394
- void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
395
- const int qk = QK_K;
396
- const int nb = n / qk;
397
- const int ncols_interleaved = 8;
398
- const int blocklen = 4;
399
- static const uint32_t kmask1 = 0x3f3f3f3f;
400
- static const uint32_t kmask2 = 0x0f0f0f0f;
401
- static const uint32_t kmask3 = 0x03030303;
550
+ template <int M, int N>
551
+ static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
552
+ float * GGML_RESTRICT s,
553
+ size_t bs,
554
+ const void * GGML_RESTRICT vx,
555
+ const void * GGML_RESTRICT vy,
556
+ int nr,
557
+ int nc) {
558
+ constexpr int blocklen = M;
559
+ constexpr int ncols_interleaved = N;
560
+ const int qk = QK_K;
561
+ const int nb = n / qk;
562
+ static const uint32_t kmask1 = 0x3f3f3f3f;
563
+ static const uint32_t kmask2 = 0x0f0f0f0f;
564
+ static const uint32_t kmask3 = 0x03030303;
402
565
 
403
- assert (n % qk == 0);
404
- assert (nc % ncols_interleaved == 0);
566
+ assert(n % qk == 0);
567
+ assert(nc % ncols_interleaved == 0);
405
568
 
406
569
  UNUSED(bs);
407
570
  UNUSED(nr);
408
571
 
409
- float sumf[8];
410
- float sum_minf[8];
572
+ float sumf[ncols_interleaved];
573
+ float sum_minf[ncols_interleaved];
411
574
  uint32_t utmp[32];
412
- int sumi1;
413
- int sumi2;
414
- int sumi;
575
+ int sumi1;
576
+ int sumi2;
577
+ int sumi;
415
578
 
416
579
  const block_q8_K * a_ptr = (const block_q8_K *) vy;
417
580
  for (int x = 0; x < nc / ncols_interleaved; x++) {
418
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
581
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
419
582
 
420
583
  for (int j = 0; j < ncols_interleaved; j++) {
421
- sumf[j] = 0.0;
584
+ sumf[j] = 0.0;
422
585
  sum_minf[j] = 0.0;
423
586
  }
424
587
  for (int l = 0; l < nb; l++) {
425
588
  for (int sb = 0; sb < 8; sb++) {
426
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
427
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
589
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
590
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
428
591
  const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
429
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
430
- utmp[sb * 4 + 2] = uaux_0;
592
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
593
+ utmp[sb * 4 + 2] = uaux_0;
431
594
  utmp[sb * 4 + 0] &= kmask1;
432
595
  }
433
596
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
434
- uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
435
- uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
597
+ constexpr int scale_stride = 32;
598
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
599
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
600
+
601
+ const int qh_shift = (k / (32 / blocklen)) * 2;
436
602
  for (int j = 0; j < ncols_interleaved; j++) {
437
603
  sumi1 = 0;
438
604
  sumi2 = 0;
439
- sumi = 0;
605
+ sumi = 0;
440
606
  for (int i = 0; i < blocklen; ++i) {
441
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
442
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
443
- sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
444
- sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
607
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
608
+
609
+ const int qh_idx = (k * blocklen + i) % 32;
610
+ const int qh_chunk = qh_idx / blocklen;
611
+ const int qh_pos = qh_idx % blocklen;
612
+ const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
613
+
614
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
615
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
616
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
617
+
618
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
619
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
620
+
621
+ const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
622
+
623
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
624
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
445
625
  sumi1 = sumi1 * scales_0[j];
446
626
  sumi2 = sumi2 * scales_1[j];
447
627
  sumi += sumi1 + sumi2;
@@ -452,7 +632,8 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
452
632
  for (int sb = 0; sb < 8; sb++) {
453
633
  uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
454
634
  for (int j = 0; j < ncols_interleaved; j++) {
455
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
635
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
636
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
456
637
  }
457
638
  }
458
639
  }
@@ -462,17 +643,123 @@ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
462
643
  }
463
644
  }
464
645
 
465
- void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
466
- const int qk = QK_K;
646
+ template <int M, int N>
647
+ static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
648
+ float * GGML_RESTRICT s,
649
+ size_t bs,
650
+ const void * GGML_RESTRICT vx,
651
+ const void * GGML_RESTRICT vy,
652
+ int nr,
653
+ int nc) {
654
+ constexpr int blocklen = M;
655
+ constexpr int ncols_interleaved = N;
656
+ const int qk = QK_K;
657
+ const int nb = n / qk;
658
+ static const uint32_t kmask1 = 0x3f3f3f3f;
659
+ static const uint32_t kmask2 = 0x0f0f0f0f;
660
+ static const uint32_t kmask3 = 0x03030303;
661
+
662
+ assert(n % qk == 0);
663
+ assert(nr % 4 == 0);
664
+ assert(nc % ncols_interleaved == 0);
665
+
666
+ float sumf[4][ncols_interleaved];
667
+ float sum_minf[4][ncols_interleaved];
668
+ uint32_t utmp[32];
669
+ int sumi1;
670
+ int sumi2;
671
+ int sumi;
672
+
673
+ for (int y = 0; y < nr / 4; y++) {
674
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
675
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
676
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
677
+ for (int m = 0; m < 4; m++) {
678
+ for (int j = 0; j < ncols_interleaved; j++) {
679
+ sumf[m][j] = 0.0;
680
+ sum_minf[m][j] = 0.0;
681
+ }
682
+ }
683
+ for (int l = 0; l < nb; l++) {
684
+ for (int sb = 0; sb < 8; sb++) {
685
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
686
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
687
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
688
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
689
+ utmp[sb * 4 + 2] = uaux_0;
690
+ utmp[sb * 4 + 0] &= kmask1;
691
+ }
692
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
693
+ constexpr int scale_stride = 32;
694
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
695
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
696
+
697
+ const int qh_shift = (k / (32 / blocklen)) * 2;
698
+ for (int m = 0; m < 4; m++) {
699
+ for (int j = 0; j < ncols_interleaved; j++) {
700
+ sumi1 = 0;
701
+ sumi2 = 0;
702
+ sumi = 0;
703
+ for (int i = 0; i < blocklen; ++i) {
704
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
705
+
706
+ const int qh_idx = (k * blocklen + i) % 32;
707
+ const int qh_chunk = qh_idx / blocklen;
708
+ const int qh_pos = qh_idx % blocklen;
709
+ const int b_qh_offset =
710
+ qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
711
+
712
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
713
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
714
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
715
+
716
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
717
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
718
+
719
+ const int q8_offset = (k / (32 / blocklen)) * 256 +
720
+ (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
721
+
722
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
723
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
724
+ sumi1 = sumi1 * scales_0[j];
725
+ sumi2 = sumi2 * scales_1[j];
726
+ sumi += sumi1 + sumi2;
727
+ }
728
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
729
+ }
730
+ }
731
+ }
732
+ for (int sb = 0; sb < 8; sb++) {
733
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
734
+ for (int m = 0; m < 4; m++) {
735
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
736
+ for (int j = 0; j < ncols_interleaved; j++) {
737
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
738
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
739
+ }
740
+ }
741
+ }
742
+ }
743
+ for (int m = 0; m < 4; m++) {
744
+ for (int j = 0; j < ncols_interleaved; j++) {
745
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
746
+ }
747
+ }
748
+ }
749
+ }
750
+ }
751
+
752
+ extern "C" {
753
+
754
+ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
755
+ const int qk = QK8_0;
467
756
  const int nb = n / qk;
468
- const int ncols_interleaved = 8;
469
- const int blocklen = 8;
470
- static const uint32_t kmask1 = 0x3f3f3f3f;
471
- static const uint32_t kmask2 = 0x0f0f0f0f;
472
- static const uint32_t kmask3 = 0x03030303;
757
+ const int ncols_interleaved = 4;
758
+ const int blocklen = 4;
473
759
 
474
- assert (n % qk == 0);
475
- assert (nc % ncols_interleaved == 0);
760
+ assert(nr == 1);
761
+ assert(n % qk == 0);
762
+ assert(nc % ncols_interleaved == 0);
476
763
 
477
764
  UNUSED(s);
478
765
  UNUSED(bs);
@@ -484,66 +771,35 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
484
771
  UNUSED(ncols_interleaved);
485
772
  UNUSED(blocklen);
486
773
 
487
- float sumf[8];
488
- float sum_minf[8];
489
- uint32_t utmp[32];
490
- int sumi1;
491
- int sumi2;
774
+ float sumf[4];
492
775
  int sumi;
493
776
 
494
- const block_q8_K * a_ptr = (const block_q8_K *) vy;
777
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
495
778
  for (int x = 0; x < nc / ncols_interleaved; x++) {
496
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
779
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
497
780
 
498
- for (int j = 0; j < ncols_interleaved; j++) {
499
- sumf[j] = 0.0;
500
- sum_minf[j] = 0.0;
501
- }
781
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
502
782
  for (int l = 0; l < nb; l++) {
503
- for (int sb = 0; sb < 8; sb++) {
504
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
505
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
506
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
507
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
508
- utmp[sb * 4 + 2] = uaux_0;
509
- utmp[sb * 4 + 0] &= kmask1;
510
- }
511
783
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
512
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
513
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
514
784
  for (int j = 0; j < ncols_interleaved; j++) {
515
- sumi1 = 0;
516
- sumi2 = 0;
517
785
  sumi = 0;
518
786
  for (int i = 0; i < blocklen; ++i) {
519
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
520
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
521
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
522
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
523
- sumi1 = sumi1 * scales_0[j];
524
- sumi2 = sumi2 * scales_1[j];
525
- sumi += sumi1 + sumi2;
787
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
788
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
789
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
526
790
  }
527
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
528
- }
529
- }
530
- for (int sb = 0; sb < 8; sb++) {
531
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
532
- for (int j = 0; j < ncols_interleaved; j++) {
533
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
791
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
534
792
  }
535
793
  }
536
794
  }
537
- for (int j = 0; j < ncols_interleaved; j++) {
538
- s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
539
- }
795
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
540
796
  }
541
797
  }
542
798
 
543
- void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
544
- const int qk = QK_K;
799
+ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
800
+ const int qk = QK8_0;
545
801
  const int nb = n / qk;
546
- const int ncols_interleaved = 8;
802
+ const int ncols_interleaved = 4;
547
803
  const int blocklen = 8;
548
804
 
549
805
  assert (n % qk == 0);
@@ -559,82 +815,56 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
559
815
  UNUSED(ncols_interleaved);
560
816
  UNUSED(blocklen);
561
817
 
562
- float sumf[8];
563
- float sum_minf[8];
564
- int sumi1,sumi2,sumi3,sumi4;
818
+ float sumf[4];
565
819
  int sumi;
566
820
 
567
- const block_q8_K * a_ptr = (const block_q8_K *)vy;
568
- for(int x = 0; x < nc / ncols_interleaved; x++) {
569
- const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
570
- for (int j = 0; j < ncols_interleaved; j++) {
571
- sumf[j] = 0.0;
572
- sum_minf[j] = 0.0;
573
- }
821
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
822
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
823
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
824
+
825
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
574
826
  for (int l = 0; l < nb; l++) {
575
- for (int k = 0; k < (qk / (4 * blocklen)); k++) {
576
- const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
577
- const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
578
- const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
579
- const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
827
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
580
828
  for (int j = 0; j < ncols_interleaved; j++) {
581
- sumi1 = 0;
582
- sumi2 = 0;
583
- sumi3 = 0;
584
- sumi4 = 0;
585
829
  sumi = 0;
586
- int offset = ((k / 2) % 2) + j * 2;
587
- for (int i = 0; i < blocklen; ++i){
588
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
589
- const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
590
- const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
591
- const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
592
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
593
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
594
- sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
595
- sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
596
-
597
- sumi1 = sumi1 * (scales_0[offset] & 0xF);
598
- sumi2 = sumi2 * (scales_1[offset] & 0xF);
599
- sumi3 = sumi3 * (scales_2[offset] & 0xF);
600
- sumi4 = sumi4 * (scales_3[offset] & 0xF);
601
- sumi += sumi1 + sumi2 + sumi3 + sumi4;
830
+ for (int i = 0; i < blocklen; ++i) {
831
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
832
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
833
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
602
834
  }
603
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
604
- }
605
- }
606
- for(int sb = 0; sb < 8; sb++) {
607
- const uint8_t *mins = b_ptr[l].scales + sb * 16;
608
- for(int j = 0; j < ncols_interleaved; j++){
609
- sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
835
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
610
836
  }
611
837
  }
612
838
  }
613
- for (int j = 0; j < ncols_interleaved; j++) {
614
- s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
615
- }
839
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
616
840
  }
617
841
  }
618
842
 
619
- void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
843
+ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
620
844
  const int qk = QK8_0;
621
845
  const int nb = n / qk;
622
- const int ncols_interleaved = 4;
623
- const int blocklen = 4;
846
+ const int ncols_interleaved = 8;
847
+ const int blocklen = 8;
624
848
 
625
- assert(nr == 1);
626
- assert(n % qk == 0);
627
- assert(nc % ncols_interleaved == 0);
849
+ assert (n % qk == 0);
850
+ assert (nc % ncols_interleaved == 0);
628
851
 
852
+ UNUSED(s);
629
853
  UNUSED(bs);
854
+ UNUSED(vx);
855
+ UNUSED(vy);
630
856
  UNUSED(nr);
857
+ UNUSED(nc);
858
+ UNUSED(nb);
859
+ UNUSED(ncols_interleaved);
860
+ UNUSED(blocklen);
631
861
 
632
- float sumf[4];
862
+ float sumf[8];
633
863
  int sumi;
634
864
 
635
865
  const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
636
866
  for (int x = 0; x < nc / ncols_interleaved; x++) {
637
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
867
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
638
868
 
639
869
  for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
640
870
  for (int l = 0; l < nb; l++) {
@@ -642,9 +872,9 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
642
872
  for (int j = 0; j < ncols_interleaved; j++) {
643
873
  sumi = 0;
644
874
  for (int i = 0; i < blocklen; ++i) {
645
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
646
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
647
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
875
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
876
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
877
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
648
878
  }
649
879
  sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
650
880
  }
@@ -654,139 +884,1212 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
654
884
  }
655
885
  }
656
886
 
657
- void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
658
- const int qk = QK8_0;
887
+ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
888
+ const int qk = QK_K;
659
889
  const int nb = n / qk;
660
890
  const int ncols_interleaved = 8;
661
- const int blocklen = 8;
891
+ const int blocklen = 4;
892
+ static const uint32_t kmask1 = 0x3f3f3f3f;
893
+ static const uint32_t kmask2 = 0x0f0f0f0f;
894
+ static const uint32_t kmask3 = 0x03030303;
662
895
 
663
- assert(nr == 1);
664
- assert(n % qk == 0);
665
- assert(nc % ncols_interleaved == 0);
896
+ assert (n % qk == 0);
897
+ assert (nc % ncols_interleaved == 0);
666
898
 
667
899
  UNUSED(bs);
668
900
  UNUSED(nr);
669
901
 
670
902
  float sumf[8];
903
+ float sum_minf[8];
904
+ uint32_t utmp[32];
905
+ int sumi1;
906
+ int sumi2;
671
907
  int sumi;
672
908
 
673
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
909
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
674
910
  for (int x = 0; x < nc / ncols_interleaved; x++) {
675
- const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
911
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
676
912
 
677
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
913
+ for (int j = 0; j < ncols_interleaved; j++) {
914
+ sumf[j] = 0.0;
915
+ sum_minf[j] = 0.0;
916
+ }
678
917
  for (int l = 0; l < nb; l++) {
918
+ for (int sb = 0; sb < 8; sb++) {
919
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
920
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
921
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
922
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
923
+ utmp[sb * 4 + 2] = uaux_0;
924
+ utmp[sb * 4 + 0] &= kmask1;
925
+ }
679
926
  for (int k = 0; k < (qk / (2 * blocklen)); k++) {
927
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
928
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
680
929
  for (int j = 0; j < ncols_interleaved; j++) {
681
- sumi = 0;
930
+ sumi1 = 0;
931
+ sumi2 = 0;
932
+ sumi = 0;
933
+ for (int i = 0; i < blocklen; ++i) {
934
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
935
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
936
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
937
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
938
+ sumi1 = sumi1 * scales_0[j];
939
+ sumi2 = sumi2 * scales_1[j];
940
+ sumi += sumi1 + sumi2;
941
+ }
942
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
943
+ }
944
+ }
945
+ for (int sb = 0; sb < 8; sb++) {
946
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
947
+ for (int j = 0; j < ncols_interleaved; j++) {
948
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
949
+ }
950
+ }
951
+ }
952
+ for (int j = 0; j < ncols_interleaved; j++) {
953
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
954
+ }
955
+ }
956
+ }
957
+
958
+ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
959
+ const int qk = QK_K;
960
+ const int nb = n / qk;
961
+ const int ncols_interleaved = 8;
962
+ const int blocklen = 8;
963
+ static const uint32_t kmask1 = 0x3f3f3f3f;
964
+ static const uint32_t kmask2 = 0x0f0f0f0f;
965
+ static const uint32_t kmask3 = 0x03030303;
966
+
967
+ assert (n % qk == 0);
968
+ assert (nc % ncols_interleaved == 0);
969
+
970
+ UNUSED(bs);
971
+ UNUSED(nr);
972
+
973
+ float sumf[8];
974
+ float sum_minf[8];
975
+ uint32_t utmp[32];
976
+ int sumi1;
977
+ int sumi2;
978
+ int sumi;
979
+
980
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
981
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
982
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
983
+
984
+ for (int j = 0; j < ncols_interleaved; j++) {
985
+ sumf[j] = 0.0;
986
+ sum_minf[j] = 0.0;
987
+ }
988
+ for (int l = 0; l < nb; l++) {
989
+ for (int sb = 0; sb < 8; sb++) {
990
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
991
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
992
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
993
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
994
+ utmp[sb * 4 + 2] = uaux_0;
995
+ utmp[sb * 4 + 0] &= kmask1;
996
+ }
997
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
998
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
999
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
1000
+ for (int j = 0; j < ncols_interleaved; j++) {
1001
+ sumi1 = 0;
1002
+ sumi2 = 0;
1003
+ sumi = 0;
1004
+ for (int i = 0; i < blocklen; ++i) {
1005
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1006
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1007
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
1008
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
1009
+ sumi1 = sumi1 * scales_0[j];
1010
+ sumi2 = sumi2 * scales_1[j];
1011
+ sumi += sumi1 + sumi2;
1012
+ }
1013
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1014
+ }
1015
+ }
1016
+ for (int sb = 0; sb < 8; sb++) {
1017
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1018
+ for (int j = 0; j < ncols_interleaved; j++) {
1019
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1020
+ }
1021
+ }
1022
+ }
1023
+ for (int j = 0; j < ncols_interleaved; j++) {
1024
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1030
+ const int qk = QK_K;
1031
+ const int nb = n / qk;
1032
+ const int ncols_interleaved = 8;
1033
+ const int blocklen = 8;
1034
+
1035
+ assert (n % qk == 0);
1036
+ assert (nc % ncols_interleaved == 0);
1037
+
1038
+ UNUSED(s);
1039
+ UNUSED(bs);
1040
+ UNUSED(vx);
1041
+ UNUSED(vy);
1042
+ UNUSED(nr);
1043
+ UNUSED(nc);
1044
+ UNUSED(nb);
1045
+ UNUSED(ncols_interleaved);
1046
+ UNUSED(blocklen);
1047
+
1048
+ float sumf[8];
1049
+ float sum_minf[8];
1050
+ int sumi1,sumi2,sumi3,sumi4;
1051
+ int sumi;
1052
+
1053
+ const block_q8_K * a_ptr = (const block_q8_K *)vy;
1054
+ for(int x = 0; x < nc / ncols_interleaved; x++) {
1055
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
1056
+ for (int j = 0; j < ncols_interleaved; j++) {
1057
+ sumf[j] = 0.0;
1058
+ sum_minf[j] = 0.0;
1059
+ }
1060
+ for (int l = 0; l < nb; l++) {
1061
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
1062
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
1063
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
1064
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
1065
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
1066
+ for (int j = 0; j < ncols_interleaved; j++) {
1067
+ sumi1 = 0;
1068
+ sumi2 = 0;
1069
+ sumi3 = 0;
1070
+ sumi4 = 0;
1071
+ sumi = 0;
1072
+ int offset = ((k / 2) % 2) + j * 2;
1073
+ for (int i = 0; i < blocklen; ++i){
1074
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
1075
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
1076
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
1077
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
1078
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
1079
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
1080
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
1081
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
1082
+
1083
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
1084
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
1085
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
1086
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
1087
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
1088
+ }
1089
+ sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1090
+ }
1091
+ }
1092
+ for(int sb = 0; sb < 8; sb++) {
1093
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
1094
+ for(int j = 0; j < ncols_interleaved; j++){
1095
+ sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1096
+ }
1097
+ }
1098
+ }
1099
+ for (int j = 0; j < ncols_interleaved; j++) {
1100
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1101
+ }
1102
+ }
1103
+ }
1104
+
1105
+ void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1106
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1107
+ }
1108
+
1109
+ void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1110
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1111
+ }
1112
+
1113
+
1114
+ void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1115
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1116
+ }
1117
+
1118
+ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1119
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1120
+ }
1121
+
1122
+ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1123
+ const int qk = QK8_0;
1124
+ const int nb = n / qk;
1125
+ const int ncols_interleaved = 4;
1126
+ const int blocklen = 4;
1127
+
1128
+ assert(nr == 1);
1129
+ assert(n % qk == 0);
1130
+ assert(nc % ncols_interleaved == 0);
1131
+
1132
+ UNUSED(bs);
1133
+ UNUSED(nr);
1134
+
1135
+ float sumf[4];
1136
+ int sumi;
1137
+
1138
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1139
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1140
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1141
+
1142
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1143
+ for (int l = 0; l < nb; l++) {
1144
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1145
+ for (int j = 0; j < ncols_interleaved; j++) {
1146
+ sumi = 0;
1147
+ for (int i = 0; i < blocklen; ++i) {
1148
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1149
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1150
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1151
+ }
1152
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1153
+ }
1154
+ }
1155
+ }
1156
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1157
+ }
1158
+ }
1159
+
1160
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1161
+ const int qk = QK8_0;
1162
+ const int nb = n / qk;
1163
+ const int ncols_interleaved = 8;
1164
+ const int blocklen = 8;
1165
+
1166
+ assert(nr == 1);
1167
+ assert(n % qk == 0);
1168
+ assert(nc % ncols_interleaved == 0);
1169
+
1170
+ UNUSED(bs);
1171
+ UNUSED(nr);
1172
+
1173
+ float sumf[8];
1174
+ int sumi;
1175
+
1176
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1177
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1178
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
1179
+
1180
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1181
+ for (int l = 0; l < nb; l++) {
1182
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1183
+ for (int j = 0; j < ncols_interleaved; j++) {
1184
+ sumi = 0;
682
1185
  for (int i = 0; i < blocklen; ++i) {
683
1186
  const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
684
1187
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
685
1188
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
686
1189
  }
687
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1190
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1191
+ }
1192
+ }
1193
+ }
1194
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1195
+ }
1196
+ }
1197
+
1198
+ void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1199
+ const int qk = QK8_0;
1200
+ const int nb = n / qk;
1201
+ const int ncols_interleaved = 4;
1202
+ const int blocklen = 4;
1203
+
1204
+ assert(nr == 1);
1205
+ assert(n % qk == 0);
1206
+ assert(nc % ncols_interleaved == 0);
1207
+
1208
+ UNUSED(bs);
1209
+ UNUSED(nr);
1210
+
1211
+ float sumf[4];
1212
+ int sumi;
1213
+
1214
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1215
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1216
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
1217
+
1218
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1219
+ for (int l = 0; l < nb; l++) {
1220
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1221
+ for (int j = 0; j < ncols_interleaved; j++) {
1222
+ sumi = 0;
1223
+ for (int i = 0; i < blocklen; ++i) {
1224
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1225
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1226
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1227
+ }
1228
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1229
+ }
1230
+ }
1231
+ }
1232
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1233
+ }
1234
+ }
1235
+
1236
+ void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1237
+ const int qk = QK8_0;
1238
+ const int nb = n / qk;
1239
+ const int ncols_interleaved = 8;
1240
+ const int blocklen = 8;
1241
+
1242
+ assert(nr == 1);
1243
+ assert(n % qk == 0);
1244
+ assert(nc % ncols_interleaved == 0);
1245
+
1246
+ UNUSED(bs);
1247
+ UNUSED(nr);
1248
+
1249
+ float sumf[8];
1250
+ int sumi;
1251
+
1252
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1253
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1254
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
1255
+
1256
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1257
+ for (int l = 0; l < nb; l++) {
1258
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1259
+ for (int j = 0; j < ncols_interleaved; j++) {
1260
+ sumi = 0;
1261
+ for (int i = 0; i < blocklen; ++i) {
1262
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1263
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1264
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1265
+ }
1266
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1267
+ }
1268
+ }
1269
+ }
1270
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1271
+ }
1272
+ }
1273
+ // Scalar fallback GEMV: dots one Q8_0 row (vy) against Q8_0 weights interleaved 4 columns per block_q8_0x4 in 4-byte runs (vx); writes nc floats to s.
1274
+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
1275
+ float * GGML_RESTRICT s,
1276
+ size_t bs,
1277
+ const void * GGML_RESTRICT vx,
1278
+ const void * GGML_RESTRICT vy,
1279
+ int nr,
1280
+ int nc) {
1281
+ const int qk = QK8_0;
1282
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1283
+ const int ncols_interleaved = 4; // output columns packed together in one weight block
1284
+ const int blocklen = 4; // consecutive bytes taken from one column before switching to the next
1285
+
1286
+ assert(nr == 1);
1287
+ assert(n % qk == 0);
1288
+ assert(nc % ncols_interleaved == 0);
1289
+
1290
+ UNUSED(bs);
1291
+ UNUSED(nr);
1292
+
1293
+ float sumf[4]; // one accumulator per interleaved column
1294
+ int sumi;
1295
+
1296
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1297
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1298
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); // the nb blocks of column group x
1299
+
1300
+ for (int j = 0; j < ncols_interleaved; j++) {
1301
+ sumf[j] = 0.0;
1302
+ }
1303
+ for (int l = 0; l < nb; l++) {
1304
+ for (int k = 0; k < (qk / blocklen); k++) {
1305
+ for (int j = 0; j < ncols_interleaved; j++) {
1306
+ sumi = 0;
1307
+ for (int i = 0; i < blocklen; ++i) {
1308
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; // weight byte i of chunk k for column j
1309
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1310
+ }
1311
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); // apply per-column weight scale and activation scale
1312
+ }
1313
+ }
1314
+ }
1315
+ for (int j = 0; j < ncols_interleaved; j++) {
1316
+ s[x * ncols_interleaved + j] = sumf[j];
1317
+ }
1318
+ }
1319
+ }
1320
+ // Scalar fallback GEMV: same computation as the 4x4 variant, but the 4 interleaved columns are stored in 8-byte runs (blocklen = 8).
1321
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
1322
+ float * GGML_RESTRICT s,
1323
+ size_t bs,
1324
+ const void * GGML_RESTRICT vx,
1325
+ const void * GGML_RESTRICT vy,
1326
+ int nr,
1327
+ int nc) {
1328
+ const int qk = QK8_0;
1329
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1330
+ const int ncols_interleaved = 4; // output columns packed together in one weight block
1331
+ const int blocklen = 8; // consecutive bytes taken from one column before switching to the next
1332
+
1333
+ assert(nr == 1);
1334
+ assert(n % qk == 0);
1335
+ assert(nc % ncols_interleaved == 0);
1336
+
1337
+ UNUSED(bs);
1338
+ UNUSED(nr);
1339
+
1340
+ float sumf[4]; // one accumulator per interleaved column
1341
+ int sumi;
1342
+
1343
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1344
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1345
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb); // the nb blocks of column group x
1346
+
1347
+ for (int j = 0; j < ncols_interleaved; j++) {
1348
+ sumf[j] = 0.0;
1349
+ }
1350
+ for (int l = 0; l < nb; l++) {
1351
+ for (int k = 0; k < (qk / blocklen); k++) {
1352
+ for (int j = 0; j < ncols_interleaved; j++) {
1353
+ sumi = 0;
1354
+ for (int i = 0; i < blocklen; ++i) {
1355
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; // weight byte i of chunk k for column j
1356
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1357
+ }
1358
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); // apply per-column weight scale and activation scale
1359
+ }
1360
+ }
1361
+ }
1362
+ for (int j = 0; j < ncols_interleaved; j++) {
1363
+ s[x * ncols_interleaved + j] = sumf[j];
1364
+ }
1365
+ }
1366
+ }
1367
+
1368
+ // Only enable these when targeting RISC-V with the Zvfh extension (see the __riscv_zvfh guard below).
1369
+ #if defined __riscv_zvfh
1370
+ void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { // Scalar fallback GEMV: one Q8_0 row (vy) times Q4_0 weights interleaved 16 columns per block_q4_0x16 (vx); writes nc floats to s.
1371
+ const int qk = QK8_0;
1372
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1373
+ const int ncols_interleaved = 16; // output columns packed together in one weight block
1374
+ const int blocklen = 1; // one packed byte per column before switching to the next column
1375
+ // NOTE(review): unlike the other gemv kernels in this file there is no assert(nr == 1) here.
1376
+ assert (n % qk == 0);
1377
+ assert (nc % ncols_interleaved == 0);
1378
+ // NOTE(review): the UNUSED() markers below cover values that ARE used further down (s, vx, vy, nc, nb, ...); they only silence warnings.
1379
+ UNUSED(s);
1380
+ UNUSED(bs);
1381
+ UNUSED(vx);
1382
+ UNUSED(vy);
1383
+ UNUSED(nr);
1384
+ UNUSED(nc);
1385
+ UNUSED(nb);
1386
+ UNUSED(ncols_interleaved);
1387
+ UNUSED(blocklen);
1388
+
1389
+ float sumf[16]; // one accumulator per interleaved column
1390
+ int sumi;
1391
+
1392
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1393
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1394
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb); // the nb blocks of column group x
1395
+
1396
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1397
+ for (int l = 0; l < nb; l++) {
1398
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1399
+ for (int j = 0; j < ncols_interleaved; j++) {
1400
+ sumi = 0;
1401
+ for (int i = 0; i < blocklen; ++i) {
1402
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); // low nibble moved into the top 4 bits of a signed byte
1403
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); // high nibble kept in the top 4 bits of a signed byte
1404
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; // the >> 4 removes the x16 factor carried by v0/v1
1405
+ }
1406
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); // apply per-column weight scale and activation scale
1407
+ }
1408
+ }
1409
+ }
1410
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1411
+ }
1412
+ }
1413
+ // Scalar fallback GEMV: one Q8_K row (vy) times Q4_K weights interleaved 16 columns per block_q4_Kx16 (vx); result is the scaled dot product minus a min-correction term.
1414
+ void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1415
+ const int qk = QK_K;
1416
+ const int nb = n / qk; // number of QK_K-element super-blocks along the reduction dimension
1417
+ const int ncols_interleaved = 16; // output columns packed together in one weight block
1418
+ const int blocklen = 1;
1419
+ assert (n % qk == 0);
1420
+ assert (nc % ncols_interleaved == 0);
1421
+ UNUSED(s); // NOTE(review): several of the values marked UNUSED here (s, vx, vy, nc, nb, ncols_interleaved) are used below
1422
+ UNUSED(bs);
1423
+ UNUSED(vx);
1424
+ UNUSED(vy);
1425
+ UNUSED(nr);
1426
+ UNUSED(nc);
1427
+ UNUSED(nb);
1428
+ UNUSED(ncols_interleaved);
1429
+ UNUSED(blocklen);
1430
+ float sumf[16]; // scaled dot-product accumulator per column
1431
+ float sum_minf[16]; // min-correction accumulator per column
1432
+ uint8_t scales[128]; // unpacked scales, indexed [sub-block * 16 + column]
1433
+ uint8_t mins[128]; // unpacked mins, indexed [sub-block * 16 + column]
1434
+ int sumi1;
1435
+ int sumi2;
1436
+ int sumi;
1437
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
1438
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1439
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb); // the nb blocks of column group x
1440
+ for (int j = 0; j < ncols_interleaved; j++) {
1441
+ sumf[j] = 0.0f;
1442
+ sum_minf[j] = 0.0f;
1443
+ }
1444
+ for (int l = 0; l < nb; l++) {
1445
+ for (int i = 0; i < 128; i++) { // first 128 packed bytes: low nibble = low 4 bits of a scale, high nibble = low 4 bits of a min
1446
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
1447
+ mins[i] = b_ptr[l].scales[i] >> 4;
1448
+ }
1449
+ for (int i = 0; i < 64; i++) { // next 64 packed bytes: four 2-bit fields that become bits 4..5 of scales[i], mins[i], scales[i + 64], mins[i + 64]
1450
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
1451
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
1452
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
1453
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
1454
+ }
1455
+ for (int sb = 0; sb < 8; sb++) { // min correction: min * (sum of the two bsums entries of sub-block sb) * dmin * d
1456
+ uint8_t *min = &mins[sb * 16];
1457
+ for (int j = 0; j < ncols_interleaved; j++) {
1458
+ sum_minf[j] += min[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1459
+ }
1460
+ }
1461
+ for (int sb = 0; sb < 8; sb += 2) { // two sub-blocks per pass: low nibbles use scales_0, high nibbles use scales_1
1462
+ uint8_t *scales_0 = &scales[sb * 16];
1463
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
1464
+ for (int i = 0; i < QK4_0; i++) {
1465
+ for (int j = 0; j < ncols_interleaved; j++) {
1466
+ sumi1 = 0;
1467
+ sumi2 = 0;
1468
+ sumi = 0;
1469
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF); // low nibble
1470
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4); // high nibble
1471
+ sumi1 = (v0 * a_ptr[l].qs[sb * 32 + i]);
1472
+ sumi2 = (v1 * a_ptr[l].qs[sb * 32 + 32 + i]);
1473
+ sumi1 = sumi1 * scales_0[j];
1474
+ sumi2 = sumi2 * scales_1[j];
1475
+ sumi += sumi1 + sumi2;
1476
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; // a_ptr[l].d is used directly here (no FP16 conversion is applied)
1477
+ }
1478
+ }
1479
+ }
1480
+ }
1481
+ for (int j = 0; j < ncols_interleaved; j++) {
1482
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1483
+ }
1484
+ }
1485
+ }
1486
+ // Scalar fallback GEMV: one Q8_0 row (vy) times IQ4_NL weights interleaved 16 columns per block_iq4_nlx16 (vx); nibbles are decoded through the kvalues_iq4nl table.
1487
+ void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1488
+ const int qk = QK8_0;
1489
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1490
+ const int ncols_interleaved = 16; // output columns packed together in one weight block
1491
+ const int blocklen = 1; // one packed byte per column before switching to the next column
1492
+
1493
+ assert(nr == 1);
1494
+ assert(n % qk == 0);
1495
+ assert(nc % ncols_interleaved == 0);
1496
+
1497
+ UNUSED(bs);
1498
+ UNUSED(nr);
1499
+
1500
+ float sumf[16]; // one accumulator per interleaved column
1501
+ int sumi;
1502
+
1503
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1504
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1505
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb); // the nb blocks of column group x
1506
+
1507
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1508
+ for (int l = 0; l < nb; l++) {
1509
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1510
+ for (int j = 0; j < ncols_interleaved; j++) {
1511
+ sumi = 0;
1512
+ for (int i = 0; i < blocklen; ++i) {
1513
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; // low nibble -> table value
1514
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; // high nibble -> table value
1515
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); // low nibble pairs with the first half of the q8 block, high nibble with the second half
1516
+ }
1517
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); // apply per-column weight scale and activation scale
1518
+ }
1519
+ }
1520
+ }
1521
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1522
+ }
1523
+ }
1524
+ // Scalar fallback GEMV: one Q8_0 row (vy) times Q8_0 weights interleaved 16 columns per block_q8_0x16, one byte per column at a time (vx).
1525
+ void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1526
+ const int qk = QK8_0;
1527
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1528
+ const int ncols_interleaved = 16; // output columns packed together in one weight block
1529
+ const int blocklen = 1; // one byte per column before switching to the next column
1530
+
1531
+ assert(nr == 1);
1532
+ assert(n % qk == 0);
1533
+ assert(nc % ncols_interleaved == 0);
1534
+
1535
+ UNUSED(bs);
1536
+ UNUSED(nr);
1537
+
1538
+ float sumf[16]; // one accumulator per interleaved column
1539
+ int sumi;
1540
+
1541
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1542
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1543
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb); // the nb blocks of column group x
1544
+
1545
+ for (int j = 0; j < ncols_interleaved; j++) {
1546
+ sumf[j] = 0.0;
1547
+ }
1548
+ for (int l = 0; l < nb; l++) {
1549
+ for (int k = 0; k < (qk / blocklen); k++) {
1550
+ for (int j = 0; j < ncols_interleaved; j++) {
1551
+ sumi = 0;
1552
+ for (int i = 0; i < blocklen; ++i) {
1553
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i]; // weight byte k of column j
1554
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1555
+ }
1556
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); // apply per-column weight scale and activation scale
1557
+ }
1558
+ }
1559
+ }
1560
+ for (int j = 0; j < ncols_interleaved; j++) {
1561
+ s[x * ncols_interleaved + j] = sumf[j];
1562
+ }
1563
+ }
1564
+ }
1565
+ // Scalar fallback GEMV: one Q8_K row (vy) times Q2_K weights interleaved 16 columns per block_q2_Kx16 (vx); writes nc floats to s.
1566
+ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1567
+ assert(n % QK_K == 0);
1568
+ assert(nr == 1);
1569
+ assert(nc % 16 == 0);
1570
+
1571
+ UNUSED(bs);
1572
+ UNUSED(nr);
1573
+
1574
+ const int nb = n / QK_K; // number of QK_K-element super-blocks along the reduction dimension
1575
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
1576
+ const block_q8_K * y = (const block_q8_K *)vy;
1577
+
1578
+ // Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8...), Odd-High(9...)
1579
+ const int sb_perm[16] = {
1580
+ 0, 4, 1, 5, 2, 6, 3, 7, // 0-7
1581
+ 8, 12, 9, 13, 10, 14, 11, 15 // 8-15
1582
+ };
1583
+ // Each iteration handles one tile of 16 output columns.
1584
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
1585
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
1586
+ const block_q8_K * y_ptr = y;
1587
+
1588
+ float sumf[16] = {0};
1589
+
1590
+ // Loop over K-blocks
1591
+ for (int k_block = 0; k_block < nb; ++k_block) {
1592
+ int32_t isum[16] = {0}; // integer dot products, already multiplied by the 4-bit sub-block scale
1593
+ int32_t summs[16] = {0}; // integer min-correction terms (bsum * 4-bit min)
1594
+
1595
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
1596
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
1597
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
1598
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
1599
+
1600
+ // Iterate over sub-blocks 0..15
1601
+ for (int sb = 0; sb < 16; ++sb) {
1602
+ // Correction Term
1603
+ int16_t bsum = bs_lhs[sb];
1604
+ int scale_offset = sb_perm[sb] * 16; // sb_perm maps the logical sub-block to its slot in the interleaved scale array
1605
+
1606
+ for (int col = 0; col < 16; ++col) {
1607
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1608
+ summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
1609
+ }
1610
+
1611
+ // Main Dot Product
1612
+ // Calculate base offsets for Q2 unpacking based on SB
1613
+ int byte_base;
1614
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
1615
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
1616
+
1617
+ int shift = ((sb / 2) % 4) * 2; // selects which of the four 2-bit fields in each byte belongs to this sub-block
1618
+
1619
+ for (int col = 0; col < 16; ++col) {
1620
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1621
+ int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits
1622
+
1623
+ // Process 16 elements (l=0..15)
1624
+ for (int l = 0; l < 16; ++l) {
1625
+ // Q2: Interleaved by column. Byte `l` contains 4 k-values.
1626
+ int qs_idx = (byte_base + l) * 16 + col;
1627
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
1628
+
1629
+ // Q8: Linear access
1630
+ int k = sb * 16 + l;
1631
+ int8_t q8_val = qs_lhs[k];
1632
+
1633
+ isum[col] += q8_val * q2_val * d_sb;
1634
+ }
1635
+ }
1636
+ }
1637
+
1638
+ // Finalize K-Block
1639
+ for (int col = 0; col < 16; ++col) {
1640
+ float d_lhs = y_ptr[k_block].d;
1641
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]); // NOTE(review): uses GGML_FP16_TO_FP32 while most kernels in this file use GGML_CPU_FP16_TO_FP32
1642
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
1643
+
1644
+ float d_all = d_lhs * d_rhs;
1645
+ float d_min = d_lhs * dm_rhs;
1646
+
1647
+ sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
1648
+ }
1649
+ }
1650
+
1651
+ for (int col = 0; col < 16; ++col) {
1652
+ s[col_tile + col] = sumf[col];
1653
+ }
1654
+ }
1655
+ }
1656
+ #endif
1657
+ // Scalar fallback GEMM: multiplies nr Q8_0 rows, packed 4 rows per block_q8_0x4 (vy), by Q4_0 weights interleaved 4 columns per block_q4_0x4 (vx); output rows are bs floats apart in s.
1658
+ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1659
+ const int qk = QK8_0;
1660
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1661
+ const int ncols_interleaved = 4; // output columns packed together in one weight block
1662
+ const int blocklen = 4; // consecutive packed bytes taken from one column/row before switching to the next
1663
+
1664
+ assert (n % qk == 0);
1665
+ assert (nr % 4 == 0);
1666
+ assert (nc % ncols_interleaved == 0);
1667
+ // NOTE(review): the UNUSED() markers below cover values that ARE used further down (s, bs, vx, vy, nr, nc, ...); they only silence warnings.
1668
+ UNUSED(s);
1669
+ UNUSED(bs);
1670
+ UNUSED(vx);
1671
+ UNUSED(vy);
1672
+ UNUSED(nr);
1673
+ UNUSED(nc);
1674
+ UNUSED(nb);
1675
+ UNUSED(ncols_interleaved);
1676
+ UNUSED(blocklen);
1677
+
1678
+ {
1679
+ float sumf[4][4]; // accumulators indexed [row within the 4-row group][interleaved column]
1680
+ int sumi;
1681
+
1682
+ for (int y = 0; y < nr / 4; y++) {
1683
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); // the nb blocks of row group y
1684
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1685
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); // the nb blocks of column group x
1686
+ for (int m = 0; m < 4; m++) {
1687
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1688
+ }
1689
+ for (int l = 0; l < nb; l++) {
1690
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1691
+ for (int m = 0; m < 4; m++) {
1692
+ for (int j = 0; j < ncols_interleaved; j++) {
1693
+ sumi = 0;
1694
+ for (int i = 0; i < blocklen; ++i) {
1695
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); // low nibble moved into the top 4 bits of a signed byte
1696
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); // high nibble kept in the top 4 bits of a signed byte
1697
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1698
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; // the >> 4 removes the x16 factor carried by v0/v1
1699
+ }
1700
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); // per-column weight scale times per-row activation scale
1701
+ }
1702
+ }
1703
+ }
1704
+ }
1705
+ for (int m = 0; m < 4; m++) {
1706
+ for (int j = 0; j < ncols_interleaved; j++)
1707
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; // bs acts as the output row stride, in floats
1708
+ }
1709
+ }
1710
+ }
1711
+ }
1712
+ }
1713
+ // Scalar fallback GEMM: same computation as the q4_0 4x4 variant, but weights and activations are interleaved in 8-byte runs (blocklen = 8).
1714
+ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1715
+ const int qk = QK8_0;
1716
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1717
+ const int ncols_interleaved = 4; // output columns packed together in one weight block
1718
+ const int blocklen = 8; // consecutive packed bytes taken from one column/row before switching to the next
1719
+
1720
+ assert (n % qk == 0);
1721
+ assert (nr % 4 == 0);
1722
+ assert (nc % ncols_interleaved == 0);
1723
+ // NOTE(review): the UNUSED() markers below cover values that ARE used further down; they only silence warnings.
1724
+ UNUSED(s);
1725
+ UNUSED(bs);
1726
+ UNUSED(vx);
1727
+ UNUSED(vy);
1728
+ UNUSED(nr);
1729
+ UNUSED(nc);
1730
+ UNUSED(nb);
1731
+ UNUSED(ncols_interleaved);
1732
+ UNUSED(blocklen);
1733
+
1734
+ float sumf[4][4]; // accumulators indexed [row within the 4-row group][interleaved column]
1735
+ int sumi;
1736
+
1737
+ for (int y = 0; y < nr / 4; y++) {
1738
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); // the nb blocks of row group y
1739
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1740
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); // the nb blocks of column group x
1741
+ for (int m = 0; m < 4; m++) {
1742
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1743
+ }
1744
+ for (int l = 0; l < nb; l++) {
1745
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1746
+ for (int m = 0; m < 4; m++) {
1747
+ for (int j = 0; j < ncols_interleaved; j++) {
1748
+ sumi = 0;
1749
+ for (int i = 0; i < blocklen; ++i) {
1750
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); // low nibble moved into the top 4 bits of a signed byte
1751
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); // high nibble kept in the top 4 bits of a signed byte
1752
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1753
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; // the >> 4 removes the x16 factor carried by v0/v1
1754
+ }
1755
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); // per-column weight scale times per-row activation scale
1756
+ }
1757
+ }
1758
+ }
1759
+ }
1760
+ for (int m = 0; m < 4; m++) {
1761
+ for (int j = 0; j < ncols_interleaved; j++)
1762
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; // bs acts as the output row stride, in floats
1763
+ }
1764
+ }
1765
+ }
1766
+ }
1767
+ // Scalar fallback GEMM: 4-row groups of Q8_0 activations (block_q8_0x4, vy) times Q4_0 weights interleaved 8 columns per block_q4_0x8 in 8-byte runs (vx).
1768
+ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1769
+ const int qk = QK8_0;
1770
+ const int nb = n / qk; // number of qk-element blocks along the reduction dimension
1771
+ const int ncols_interleaved = 8; // output columns packed together in one weight block
1772
+ const int blocklen = 8; // consecutive packed bytes taken from one column/row before switching to the next
1773
+
1774
+ assert (n % qk == 0);
1775
+ assert (nr % 4 == 0);
1776
+ assert (nc % ncols_interleaved == 0);
1777
+ // NOTE(review): the UNUSED() markers below cover values that ARE used further down; they only silence warnings.
1778
+ UNUSED(s);
1779
+ UNUSED(bs);
1780
+ UNUSED(vx);
1781
+ UNUSED(vy);
1782
+ UNUSED(nr);
1783
+ UNUSED(nc);
1784
+ UNUSED(nb);
1785
+ UNUSED(ncols_interleaved);
1786
+ UNUSED(blocklen);
1787
+
1788
+ float sumf[4][8]; // accumulators indexed [row within the 4-row group][interleaved column]
1789
+ int sumi;
1790
+
1791
+ for (int y = 0; y < nr / 4; y++) {
1792
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); // the nb blocks of row group y
1793
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1794
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); // the nb blocks of column group x
1795
+ for (int m = 0; m < 4; m++) {
1796
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1797
+ }
1798
+ for (int l = 0; l < nb; l++) {
1799
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1800
+ for (int m = 0; m < 4; m++) {
1801
+ for (int j = 0; j < ncols_interleaved; j++) {
1802
+ sumi = 0;
1803
+ for (int i = 0; i < blocklen; ++i) {
1804
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); // low nibble moved into the top 4 bits of a signed byte
1805
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); // high nibble kept in the top 4 bits of a signed byte
1806
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1807
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; // the >> 4 removes the x16 factor carried by v0/v1
1808
+ }
1809
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); // per-column weight scale times per-row activation scale
1810
+ }
1811
+ }
1812
+ }
1813
+ }
1814
+ for (int m = 0; m < 4; m++) {
1815
+ for (int j = 0; j < ncols_interleaved; j++)
1816
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; // bs acts as the output row stride, in floats
1817
+ }
1818
+ }
1819
+ }
1820
+ }
1821
+ // Scalar fallback GEMM: 4-row groups of Q8_K activations (block_q8_Kx4, vy) times Q4_K weights interleaved 8 columns per block_q4_Kx8 in 4-byte runs (vx); output is the scaled dot product minus a min correction.
1822
+ void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1823
+ const int qk = QK_K;
1824
+ const int nb = n / qk; // number of QK_K-element super-blocks along the reduction dimension
1825
+ const int ncols_interleaved = 8; // output columns packed together in one weight block
1826
+ const int blocklen = 4; // consecutive packed bytes taken from one column/row before switching to the next
1827
+ static const uint32_t kmask1 = 0x3f3f3f3f; // keeps the low 6 bits of each byte
1828
+ static const uint32_t kmask2 = 0x0f0f0f0f; // keeps the low 4 bits of each byte
1829
+ static const uint32_t kmask3 = 0x03030303; // keeps the low 2 bits of each byte
1830
+
1831
+ assert (n % qk == 0);
1832
+ assert (nr % 4 == 0);
1833
+ assert (nc % ncols_interleaved == 0);
1834
+
1835
+ UNUSED(nb);
1836
+ UNUSED(ncols_interleaved);
1837
+ UNUSED(blocklen);
1838
+
1839
+ float sumf[4][8]; // scaled dot-product accumulators, [row][column]
1840
+ float sum_minf[4][8]; // min-correction accumulators, [row][column]
1841
+ uint32_t utmp[32]; // unpacked scales/mins: 8 groups of 16 bytes
1842
+ int sumi1;
1843
+ int sumi2;
1844
+ int sumi;
1845
+
1846
+ for (int y = 0; y < nr / 4; y++) {
1847
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); // the nb blocks of row group y
1848
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1849
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); // the nb blocks of column group x
1850
+ for (int m = 0; m < 4; m++) {
1851
+ for (int j = 0; j < ncols_interleaved; j++) {
1852
+ sumf[m][j] = 0.0;
1853
+ sum_minf[m][j] = 0.0;
1854
+ }
1855
+ }
1856
+ for (int l = 0; l < nb; l++) {
1857
+ for (int sb = 0; sb < 8; sb++) { // expand each 12-byte packed group into 16 bytes of utmp; bytes 0..7 are later read as scales, bytes 8..15 as mins
1858
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1859
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1860
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1861
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1862
+ utmp[sb * 4 + 2] = uaux_0;
1863
+ utmp[sb * 4 + 0] &= kmask1;
1864
+ }
1865
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1866
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32; // scales applied to the low-nibble products
1867
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16; // scales applied to the high-nibble products
1868
+ for (int m = 0; m < 4; m++) {
1869
+ for (int j = 0; j < ncols_interleaved; j++) {
1870
+ sumi1 = 0;
1871
+ sumi2 = 0;
1872
+ sumi = 0;
1873
+ for (int i = 0; i < blocklen; ++i) {
1874
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); // low nibble
1875
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); // high nibble
1876
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
1877
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
1878
+ sumi1 = sumi1 * scales_0[j];
1879
+ sumi2 = sumi2 * scales_1[j];
1880
+ sumi += sumi1 + sumi2; // sumi1/sumi2 are overwritten each iteration; sumi is the running total
1881
+ }
1882
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; // a_ptr[l].d[m] is used directly here (no FP16 conversion is applied)
1883
+ }
1884
+ }
1885
+ }
1886
+ for (int sb = 0; sb < 8; sb++) { // min correction: min * (two bsums entries for row m) * dmin * d
1887
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
1888
+ for(int m = 0; m < 4; m++) {
1889
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); // NOTE(review): offset presumably follows the interleaved bsums layout of block_q8_Kx4; confirm against the packing code
1890
+ for(int j = 0; j < ncols_interleaved; j++) {
1891
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1892
+ }
1893
+ }
1894
+ }
1895
+ }
1896
+ for (int m = 0; m < 4; m++) {
1897
+ for (int j = 0; j < ncols_interleaved; j++) {
1898
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; // bs acts as the output row stride, in floats
1899
+ }
1900
+ }
1901
+ }
1902
+ }
1903
+ }
1904
+ // Scalar fallback GEMM: same computation as the q4_K 8x4 variant, but weights and activations are interleaved in 8-byte runs (blocklen = 8).
1905
+ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1906
+ const int qk = QK_K;
1907
+ const int nb = n / qk; // number of QK_K-element super-blocks along the reduction dimension
1908
+ const int ncols_interleaved = 8; // output columns packed together in one weight block
1909
+ const int blocklen = 8; // consecutive packed bytes taken from one column/row before switching to the next
1910
+ static const uint32_t kmask1 = 0x3f3f3f3f; // keeps the low 6 bits of each byte
1911
+ static const uint32_t kmask2 = 0x0f0f0f0f; // keeps the low 4 bits of each byte
1912
+ static const uint32_t kmask3 = 0x03030303; // keeps the low 2 bits of each byte
1913
+
1914
+ assert (n % qk == 0);
1915
+ assert (nr % 4 == 0);
1916
+ assert (nc % ncols_interleaved == 0);
1917
+
1918
+ UNUSED(bs); // NOTE(review): bs is in fact used below as the output row stride
1919
+
1920
+ float sumf[4][8]; // scaled dot-product accumulators, [row][column]
1921
+ float sum_minf[4][8]; // min-correction accumulators, [row][column]
1922
+ uint32_t utmp[32]; // unpacked scales/mins: 8 groups of 16 bytes
1923
+ int sumi1;
1924
+ int sumi2;
1925
+ int sumi;
1926
+
1927
+ for (int y = 0; y < nr / 4; y++) {
1928
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); // the nb blocks of row group y
1929
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1930
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); // the nb blocks of column group x
1931
+ for (int m = 0; m < 4; m++) {
1932
+ for (int j = 0; j < ncols_interleaved; j++) {
1933
+ sumf[m][j] = 0.0;
1934
+ sum_minf[m][j] = 0.0;
1935
+ }
1936
+ }
1937
+ for (int l = 0; l < nb; l++) {
1938
+ for (int sb = 0; sb < 8; sb++) { // expand each 12-byte packed group into 16 bytes of utmp; bytes 0..7 are later read as scales, bytes 8..15 as mins
1939
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1940
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1941
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1942
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1943
+ utmp[sb * 4 + 2] = uaux_0;
1944
+ utmp[sb * 4 + 0] &= kmask1;
1945
+ }
1946
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1947
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; // scales applied to the low-nibble products
1948
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; // scales applied to the high-nibble products
1949
+ for (int m = 0; m < 4; m++) {
1950
+ for (int j = 0; j < ncols_interleaved; j++) {
1951
+ sumi1 = 0;
1952
+ sumi2 = 0;
1953
+ sumi = 0;
1954
+ for (int i = 0; i < blocklen; ++i) {
1955
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); // low nibble
1956
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); // high nibble
1957
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1958
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1959
+ sumi1 = sumi1 * scales_0[j];
1960
+ sumi2 = sumi2 * scales_1[j];
1961
+ sumi += sumi1 + sumi2; // sumi1/sumi2 are overwritten each iteration; sumi is the running total
1962
+ }
1963
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; // a_ptr[l].d[m] is used directly here (no FP16 conversion is applied)
1964
+ }
1965
+ }
1966
+ }
1967
+ for (int sb = 0; sb < 8; sb++) { // min correction: min * (two bsums entries for row m) * dmin * d
1968
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1969
+ for(int m = 0; m < 4; m++) {
1970
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); // NOTE(review): offset presumably follows the interleaved bsums layout of block_q8_Kx4; confirm against the packing code
1971
+ for(int j = 0; j < ncols_interleaved; j++) {
1972
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1973
+ }
1974
+ }
1975
+ }
1976
+ }
1977
+ for (int m = 0; m < 4; m++) {
1978
+ for (int j = 0; j < ncols_interleaved; j++) {
1979
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; // bs acts as the output row stride, in floats
688
1980
  }
689
1981
  }
690
1982
  }
691
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
692
1983
  }
693
1984
  }
694
1985
 
695
- void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
696
- float * GGML_RESTRICT s,
697
- size_t bs,
698
- const void * GGML_RESTRICT vx,
699
- const void * GGML_RESTRICT vy,
700
- int nr,
701
- int nc) {
702
- const int qk = QK8_0;
703
- const int nb = n / qk;
704
- const int ncols_interleaved = 4;
705
- const int blocklen = 4;
1986
+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { // Scalar fallback GEMM: 4-row groups of Q8_K activations (block_q8_Kx4, vy) times Q2_K weights interleaved 8 columns per block_q2_Kx8 (vx); output is the scaled dot product minus a min correction.
1987
+ const int qk = QK_K;
1988
+ const int nb = n / qk; // number of QK_K-element super-blocks along the reduction dimension
1989
+ const int ncols_interleaved = 8; // output columns packed together in one weight block
1990
+ const int blocklen = 8; // consecutive packed bytes taken from one column/row before switching to the next
706
1991
 
707
- assert(nr == 1);
708
- assert(n % qk == 0);
709
- assert(nc % ncols_interleaved == 0);
1992
+ assert (n % qk == 0);
1993
+ assert (nr % 4 == 0);
1994
+ assert (nc % ncols_interleaved == 0);
710
1995
 
1996
+ UNUSED(s); // NOTE(review): most values marked UNUSED in this function are used further down
711
1997
  UNUSED(bs);
1998
+ UNUSED(vx);
1999
+ UNUSED(vy);
712
2000
  UNUSED(nr);
2001
+ UNUSED(nc);
2002
+ UNUSED(nb);
2003
+ UNUSED(ncols_interleaved);
2004
+ UNUSED(blocklen);
713
2005
 
714
- float sumf[4];
715
- int sumi;
716
-
717
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
718
- for (int x = 0; x < nc / ncols_interleaved; x++) {
719
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2006
+ float sumf[4][8]; // scaled dot-product accumulators, [row][column]
2007
+ float sum_minf[4][8]; // min-correction accumulators, [row][column]
2008
+ int sumi1, sumi2, sumi3, sumi4; // per-byte products for the four 2-bit fields
2009
+ int sumi;
720
2010
 
721
- for (int j = 0; j < ncols_interleaved; j++) {
722
- sumf[j] = 0.0;
723
- }
724
- for (int l = 0; l < nb; l++) {
725
- for (int k = 0; k < (qk / blocklen); k++) {
2011
+ for (int y = 0; y < nr / 4; y++) {
2012
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); // the nb blocks of row group y
2013
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2014
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb); // the nb blocks of column group x
2015
+ for (int m = 0; m < 4; m++) {
726
2016
  for (int j = 0; j < ncols_interleaved; j++) {
727
- sumi = 0;
728
- for (int i = 0; i < blocklen; ++i) {
729
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
730
- sumi += v0 * a_ptr[l].qs[k * blocklen + i];
2017
+ sumf[m][j] = 0.0;
2018
+ sum_minf[m][j] = 0.0;
2019
+ }
2020
+ }
2021
+ for (int l = 0; l < nb; l++) {
2022
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
2023
+
2024
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ; // scale bytes for the 2-bit field at bits 0..1
2025
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16; // scale bytes for the 2-bit field at bits 2..3
2026
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32; // scale bytes for the 2-bit field at bits 4..5
2027
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48; // scale bytes for the 2-bit field at bits 6..7
2028
+ for (int m = 0; m < 4; m++) {
2029
+ for (int j = 0; j < ncols_interleaved; j++) {
2030
+ sumi1 = 0;
2031
+ sumi2 = 0;
2032
+ sumi3 = 0;
2033
+ sumi4 = 0;
2034
+ sumi = 0;
2035
+ int offset = ((k / 2) % 2) + j * 2; // two scale bytes per column; (k / 2) % 2 picks which one applies to this chunk
2036
+ for (int i = 0; i < blocklen; ++i){
2037
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
2038
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
2039
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
2040
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
2041
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
2042
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
2043
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
2044
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
2045
+ sumi1 = sumi1 * (scales_0[offset] & 0xF); // low nibble of a scale byte is the scale; the high nibble is used as the min below
2046
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
2047
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
2048
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
2049
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
2050
+ }
2051
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; // NOTE(review): uses GGML_FP16_TO_FP32 while most kernels in this file use GGML_CPU_FP16_TO_FP32
2052
+ }
2053
+ }
2054
+ }
2055
+ for(int sb = 0; sb < 8; sb++) { // min correction: high nibbles of the scale bytes times the matching bsums entries
2056
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
2057
+ for(int m = 0; m < 4; m++) {
2058
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); // NOTE(review): offset presumably follows the interleaved bsums layout of block_q8_Kx4; confirm against the packing code
2059
+ for(int j = 0; j < ncols_interleaved; j++) {
2060
+ int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
2061
+ sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2062
+ }
731
2063
  }
732
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
733
2064
  }
734
2065
  }
735
- }
736
- for (int j = 0; j < ncols_interleaved; j++) {
737
- s[x * ncols_interleaved + j] = sumf[j];
2066
+
2067
+ for (int m = 0; m < 4; m++) {
2068
+ for (int j = 0; j < ncols_interleaved; j++) {
2069
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; // bs acts as the output row stride, in floats
2070
+ }
2071
+ }
738
2072
  }
739
2073
  }
740
2074
  }
741
2075
 
742
- void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
743
- float * GGML_RESTRICT s,
744
- size_t bs,
745
- const void * GGML_RESTRICT vx,
746
- const void * GGML_RESTRICT vy,
747
- int nr,
748
- int nc) {
749
- const int qk = QK8_0;
750
- const int nb = n / qk;
751
- const int ncols_interleaved = 4;
752
- const int blocklen = 8;
753
-
754
- assert(nr == 1);
755
- assert(n % qk == 0);
756
- assert(nc % ncols_interleaved == 0);
757
-
758
- UNUSED(bs);
759
- UNUSED(nr);
2076
+ void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2077
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc); // thin wrapper: forwards every argument unchanged; <4, 8> is presumably <blocklen, ncols_interleaved> for the 8x4 layout (confirm against the template)
2078
+ }
760
2079
 
761
- float sumf[4];
762
- int sumi;
2080
+ void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2081
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc); // thin wrapper: forwards every argument unchanged to the shared template, instantiated with <8, 8>
2082
+ }
763
2083
 
764
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
765
- for (int x = 0; x < nc / ncols_interleaved; x++) {
766
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2084
+ void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2085
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc); // thin wrapper: forwards every argument unchanged; <4, 8> is presumably <blocklen, ncols_interleaved> for the 8x4 layout (confirm against the template)
2086
+ }
767
2087
 
768
- for (int j = 0; j < ncols_interleaved; j++) {
769
- sumf[j] = 0.0;
770
- }
771
- for (int l = 0; l < nb; l++) {
772
- for (int k = 0; k < (qk / blocklen); k++) {
773
- for (int j = 0; j < ncols_interleaved; j++) {
774
- sumi = 0;
775
- for (int i = 0; i < blocklen; ++i) {
776
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
777
- sumi += v0 * a_ptr[l].qs[k * blocklen + i];
778
- }
779
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
780
- }
781
- }
782
- }
783
- for (int j = 0; j < ncols_interleaved; j++) {
784
- s[x * ncols_interleaved + j] = sumf[j];
785
- }
786
- }
2088
+ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2089
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc); // thin wrapper: forwards every argument unchanged to the shared template, instantiated with <8, 8>
787
2090
  }
788
2091
 
789
- void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2092
+ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
790
2093
  const int qk = QK8_0;
791
2094
  const int nb = n / qk;
792
2095
  const int ncols_interleaved = 4;
@@ -813,7 +2116,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
813
2116
  for (int y = 0; y < nr / 4; y++) {
814
2117
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
815
2118
  for (int x = 0; x < nc / ncols_interleaved; x++) {
816
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2119
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
817
2120
  for (int m = 0; m < 4; m++) {
818
2121
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
819
2122
  }
@@ -823,10 +2126,10 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
823
2126
  for (int j = 0; j < ncols_interleaved; j++) {
824
2127
  sumi = 0;
825
2128
  for (int i = 0; i < blocklen; ++i) {
826
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
827
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2129
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2130
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
828
2131
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
829
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2132
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
830
2133
  }
831
2134
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
832
2135
  }
@@ -842,33 +2145,23 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
842
2145
  }
843
2146
  }
844
2147
 
845
- void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2148
+ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
846
2149
  const int qk = QK8_0;
847
2150
  const int nb = n / qk;
848
- const int ncols_interleaved = 4;
2151
+ const int ncols_interleaved = 8;
849
2152
  const int blocklen = 8;
850
2153
 
851
- assert (n % qk == 0);
852
- assert (nr % 4 == 0);
853
- assert (nc % ncols_interleaved == 0);
854
-
855
- UNUSED(s);
856
- UNUSED(bs);
857
- UNUSED(vx);
858
- UNUSED(vy);
859
- UNUSED(nr);
860
- UNUSED(nc);
861
- UNUSED(nb);
862
- UNUSED(ncols_interleaved);
863
- UNUSED(blocklen);
2154
+ assert(n % qk == 0);
2155
+ assert(nr % 4 == 0);
2156
+ assert(nc % ncols_interleaved == 0);
864
2157
 
865
- float sumf[4][4];
2158
+ float sumf[4][8];
866
2159
  int sumi;
867
2160
 
868
2161
  for (int y = 0; y < nr / 4; y++) {
869
2162
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
870
2163
  for (int x = 0; x < nc / ncols_interleaved; x++) {
871
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2164
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
872
2165
  for (int m = 0; m < 4; m++) {
873
2166
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
874
2167
  }
@@ -878,10 +2171,10 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
878
2171
  for (int j = 0; j < ncols_interleaved; j++) {
879
2172
  sumi = 0;
880
2173
  for (int i = 0; i < blocklen; ++i) {
881
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
882
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2174
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2175
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
883
2176
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
884
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2177
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
885
2178
  }
886
2179
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
887
2180
  }
@@ -896,25 +2189,59 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
896
2189
  }
897
2190
  }
898
2191
 
899
- void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2192
+ void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
900
2193
  const int qk = QK8_0;
901
2194
  const int nb = n / qk;
902
- const int ncols_interleaved = 8;
903
- const int blocklen = 8;
2195
+ const int ncols_interleaved = 4;
2196
+ const int blocklen = 4;
904
2197
 
905
- assert (n % qk == 0);
906
- assert (nr % 4 == 0);
907
- assert (nc % ncols_interleaved == 0);
2198
+ assert(n % qk == 0);
2199
+ assert(nr % 4 == 0);
2200
+ assert(nc % ncols_interleaved == 0);
908
2201
 
909
- UNUSED(s);
910
- UNUSED(bs);
911
- UNUSED(vx);
912
- UNUSED(vy);
913
- UNUSED(nr);
914
- UNUSED(nc);
915
- UNUSED(nb);
916
- UNUSED(ncols_interleaved);
917
- UNUSED(blocklen);
2202
+ float sumf[4][4];
2203
+ int sumi;
2204
+
2205
+ for (int y = 0; y < nr / 4; y++) {
2206
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2207
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2208
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
2209
+ for (int m = 0; m < 4; m++) {
2210
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2211
+ }
2212
+ for (int l = 0; l < nb; l++) {
2213
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2214
+ for (int m = 0; m < 4; m++) {
2215
+ for (int j = 0; j < ncols_interleaved; j++) {
2216
+ sumi = 0;
2217
+ for (int i = 0; i < blocklen; ++i) {
2218
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2219
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2220
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2221
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2222
+ }
2223
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2224
+ }
2225
+ }
2226
+ }
2227
+ }
2228
+ for (int m = 0; m < 4; m++) {
2229
+ for (int j = 0; j < ncols_interleaved; j++)
2230
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2231
+ }
2232
+ }
2233
+ }
2234
+ }
2235
+
2236
+ void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2237
+ const int qk = QK8_0;
2238
+ const int nb = n / qk;
2239
+ const int ncols_interleaved = 8;
2240
+ const int blocklen = 8;
2241
+
2242
+ assert(n % qk == 0);
2243
+ assert(nr % 4 == 0);
2244
+ assert(nc % ncols_interleaved == 0);
918
2245
 
919
2246
  float sumf[4][8];
920
2247
  int sumi;
@@ -922,7 +2249,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
922
2249
  for (int y = 0; y < nr / 4; y++) {
923
2250
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
924
2251
  for (int x = 0; x < nc / ncols_interleaved; x++) {
925
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
2252
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
926
2253
  for (int m = 0; m < 4; m++) {
927
2254
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
928
2255
  }
@@ -932,12 +2259,12 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
932
2259
  for (int j = 0; j < ncols_interleaved; j++) {
933
2260
  sumi = 0;
934
2261
  for (int i = 0; i < blocklen; ++i) {
935
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
936
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2262
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2263
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
937
2264
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
938
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2265
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
939
2266
  }
940
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2267
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
941
2268
  }
942
2269
  }
943
2270
  }
@@ -950,183 +2277,119 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
950
2277
  }
951
2278
  }
952
2279
 
953
- void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
954
- const int qk = QK_K;
955
- const int nb = n / qk;
956
- const int ncols_interleaved = 8;
957
- const int blocklen = 4;
958
- static const uint32_t kmask1 = 0x3f3f3f3f;
959
- static const uint32_t kmask2 = 0x0f0f0f0f;
960
- static const uint32_t kmask3 = 0x03030303;
961
-
962
- assert (n % qk == 0);
963
- assert (nr % 4 == 0);
964
- assert (nc % ncols_interleaved == 0);
2280
+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
2281
+ float * GGML_RESTRICT s,
2282
+ size_t bs,
2283
+ const void * GGML_RESTRICT vx,
2284
+ const void * GGML_RESTRICT vy,
2285
+ int nr,
2286
+ int nc) {
2287
+ const int qk = QK8_0;
2288
+ const int nb = n / qk;
2289
+ const int ncols_interleaved = 4;
2290
+ const int blocklen = 4;
965
2291
 
966
- UNUSED(nb);
967
- UNUSED(ncols_interleaved);
968
- UNUSED(blocklen);
2292
+ assert(n % qk == 0);
2293
+ assert(nr % 4 == 0);
2294
+ assert(nc % ncols_interleaved == 0);
969
2295
 
970
- float sumf[4][8];
971
- float sum_minf[4][8];
972
- uint32_t utmp[32];
973
- int sumi1;
974
- int sumi2;
975
- int sumi;
2296
+ float sumf[4][4];
2297
+ int sumi;
976
2298
 
977
2299
  for (int y = 0; y < nr / 4; y++) {
978
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2300
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
979
2301
  for (int x = 0; x < nc / ncols_interleaved; x++) {
980
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
2302
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
981
2303
  for (int m = 0; m < 4; m++) {
982
2304
  for (int j = 0; j < ncols_interleaved; j++) {
983
2305
  sumf[m][j] = 0.0;
984
- sum_minf[m][j] = 0.0;
985
2306
  }
986
2307
  }
987
2308
  for (int l = 0; l < nb; l++) {
988
- for (int sb = 0; sb < 8; sb++) {
989
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
990
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
991
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
992
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
993
- utmp[sb * 4 + 2] = uaux_0;
994
- utmp[sb * 4 + 0] &= kmask1;
995
- }
996
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
997
- uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
998
- uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
2309
+ for (int k = 0; k < (qk / blocklen); k++) {
999
2310
  for (int m = 0; m < 4; m++) {
1000
2311
  for (int j = 0; j < ncols_interleaved; j++) {
1001
- sumi1 = 0;
1002
- sumi2 = 0;
1003
2312
  sumi = 0;
1004
2313
  for (int i = 0; i < blocklen; ++i) {
1005
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1006
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1007
- sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
1008
- sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
1009
- sumi1 = sumi1 * scales_0[j];
1010
- sumi2 = sumi2 * scales_1[j];
1011
- sumi += sumi1 + sumi2;
2314
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2315
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1012
2316
  }
1013
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1014
- }
1015
- }
1016
- }
1017
- for (int sb = 0; sb < 8; sb++) {
1018
- uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
1019
- for(int m = 0; m < 4; m++) {
1020
- const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1021
- for(int j = 0; j < ncols_interleaved; j++) {
1022
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2317
+ sumf[m][j] +=
2318
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1023
2319
  }
1024
2320
  }
1025
2321
  }
1026
2322
  }
1027
2323
  for (int m = 0; m < 4; m++) {
1028
2324
  for (int j = 0; j < ncols_interleaved; j++) {
1029
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2325
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1030
2326
  }
1031
2327
  }
1032
2328
  }
1033
2329
  }
1034
2330
  }
1035
2331
 
1036
- void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1037
- const int qk = QK_K;
1038
- const int nb = n / qk;
1039
- const int ncols_interleaved = 8;
1040
- const int blocklen = 8;
1041
- static const uint32_t kmask1 = 0x3f3f3f3f;
1042
- static const uint32_t kmask2 = 0x0f0f0f0f;
1043
- static const uint32_t kmask3 = 0x03030303;
1044
2332
 
1045
- assert (n % qk == 0);
1046
- assert (nr % 4 == 0);
1047
- assert (nc % ncols_interleaved == 0);
1048
2333
 
1049
- UNUSED(s);
1050
- UNUSED(bs);
1051
- UNUSED(vx);
1052
- UNUSED(vy);
1053
- UNUSED(nr);
1054
- UNUSED(nc);
1055
- UNUSED(nb);
1056
- UNUSED(ncols_interleaved);
1057
- UNUSED(blocklen);
2334
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
2335
+ float * GGML_RESTRICT s,
2336
+ size_t bs,
2337
+ const void * GGML_RESTRICT vx,
2338
+ const void * GGML_RESTRICT vy,
2339
+ int nr,
2340
+ int nc) {
2341
+ const int qk = QK8_0;
2342
+ const int nb = n / qk;
2343
+ const int ncols_interleaved = 4;
2344
+ const int blocklen = 8;
1058
2345
 
1059
- float sumf[4][8];
1060
- float sum_minf[4][8];
1061
- uint32_t utmp[32];
1062
- int sumi1;
1063
- int sumi2;
1064
- int sumi;
2346
+ assert(n % qk == 0);
2347
+ assert(nr % 4 == 0);
2348
+ assert(nc % ncols_interleaved == 0);
2349
+
2350
+ float sumf[4][4];
2351
+ int sumi;
1065
2352
 
1066
2353
  for (int y = 0; y < nr / 4; y++) {
1067
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2354
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1068
2355
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1069
- const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
2356
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1070
2357
  for (int m = 0; m < 4; m++) {
1071
2358
  for (int j = 0; j < ncols_interleaved; j++) {
1072
2359
  sumf[m][j] = 0.0;
1073
- sum_minf[m][j] = 0.0;
1074
2360
  }
1075
2361
  }
1076
2362
  for (int l = 0; l < nb; l++) {
1077
- for (int sb = 0; sb < 8; sb++) {
1078
- memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1079
- utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1080
- const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1081
- utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1082
- utmp[sb * 4 + 2] = uaux_0;
1083
- utmp[sb * 4 + 0] &= kmask1;
1084
- }
1085
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1086
- uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
1087
- uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
2363
+ for (int k = 0; k < (qk / blocklen); k++) {
1088
2364
  for (int m = 0; m < 4; m++) {
1089
2365
  for (int j = 0; j < ncols_interleaved; j++) {
1090
- sumi1 = 0;
1091
- sumi2 = 0;
1092
2366
  sumi = 0;
1093
2367
  for (int i = 0; i < blocklen; ++i) {
1094
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1095
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1096
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1097
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1098
- sumi1 = sumi1 * scales_0[j];
1099
- sumi2 = sumi2 * scales_1[j];
1100
- sumi += sumi1 + sumi2;
2368
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2369
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
1101
2370
  }
1102
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1103
- }
1104
- }
1105
- }
1106
- for (int sb = 0; sb < 8; sb++) {
1107
- uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1108
- for(int m = 0; m < 4; m++) {
1109
- const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1110
- for(int j = 0; j < ncols_interleaved; j++) {
1111
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2371
+ sumf[m][j] +=
2372
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1112
2373
  }
1113
2374
  }
1114
2375
  }
1115
2376
  }
1116
2377
  for (int m = 0; m < 4; m++) {
1117
2378
  for (int j = 0; j < ncols_interleaved; j++) {
1118
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2379
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1119
2380
  }
1120
2381
  }
1121
2382
  }
1122
2383
  }
1123
2384
  }
1124
2385
 
1125
- void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1126
- const int qk = QK_K;
2386
+ // Only enable these for RISC-V.
2387
+ #if defined __riscv_zvfh
2388
+ void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2389
+ const int qk = QK8_0;
1127
2390
  const int nb = n / qk;
1128
- const int ncols_interleaved = 8;
1129
- const int blocklen = 8;
2391
+ const int ncols_interleaved = 16;
2392
+ const int blocklen = 1;
1130
2393
 
1131
2394
  assert (n % qk == 0);
1132
2395
  assert (nr % 4 == 0);
@@ -1142,82 +2405,45 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1142
2405
  UNUSED(ncols_interleaved);
1143
2406
  UNUSED(blocklen);
1144
2407
 
1145
- float sumf[4][8];
1146
- float sum_minf[4][8];
1147
- int sumi1, sumi2, sumi3, sumi4;
2408
+ float sumf[4][16];
1148
2409
  int sumi;
1149
2410
 
1150
2411
  for (int y = 0; y < nr / 4; y++) {
1151
- const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2412
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1152
2413
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1153
- const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
2414
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
1154
2415
  for (int m = 0; m < 4; m++) {
1155
- for (int j = 0; j < ncols_interleaved; j++) {
1156
- sumf[m][j] = 0.0;
1157
- sum_minf[m][j] = 0.0;
1158
- }
2416
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1159
2417
  }
1160
2418
  for (int l = 0; l < nb; l++) {
1161
- for (int k = 0; k < (qk / (4 * blocklen)); k++) {
1162
-
1163
- const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
1164
- const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
1165
- const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
1166
- const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
2419
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1167
2420
  for (int m = 0; m < 4; m++) {
1168
2421
  for (int j = 0; j < ncols_interleaved; j++) {
1169
- sumi1 = 0;
1170
- sumi2 = 0;
1171
- sumi3 = 0;
1172
- sumi4 = 0;
1173
2422
  sumi = 0;
1174
- int offset = ((k / 2) % 2) + j * 2;
1175
- for (int i = 0; i < blocklen; ++i){
1176
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
1177
- const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
1178
- const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
1179
- const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
1180
- sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1181
- sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1182
- sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
1183
- sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
1184
- sumi1 = sumi1 * (scales_0[offset] & 0xF);
1185
- sumi2 = sumi2 * (scales_1[offset] & 0xF);
1186
- sumi3 = sumi3 * (scales_2[offset] & 0xF);
1187
- sumi4 = sumi4 * (scales_3[offset] & 0xF);
1188
- sumi += sumi1 + sumi2 + sumi3 + sumi4;
2423
+ for (int i = 0; i < blocklen; ++i) {
2424
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
2425
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2426
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2427
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1189
2428
  }
1190
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1191
- }
1192
- }
1193
- }
1194
- for(int sb = 0; sb < 8; sb++) {
1195
- const uint8_t *mins = b_ptr[l].scales + sb * 16;
1196
- for(int m = 0; m < 4; m++) {
1197
- const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1198
- for(int j = 0; j < ncols_interleaved; j++) {
1199
- int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
1200
- sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2429
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1201
2430
  }
1202
2431
  }
1203
2432
  }
1204
2433
  }
1205
-
1206
2434
  for (int m = 0; m < 4; m++) {
1207
- for (int j = 0; j < ncols_interleaved; j++) {
1208
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1209
- }
2435
+ for (int j = 0; j < ncols_interleaved; j++)
2436
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1210
2437
  }
1211
2438
  }
1212
2439
  }
1213
2440
  }
1214
2441
 
1215
-
1216
- void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1217
- const int qk = QK8_0;
2442
+ void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2443
+ const int qk = QK_K;
1218
2444
  const int nb = n / qk;
1219
- const int ncols_interleaved = 4;
1220
- const int blocklen = 4;
2445
+ const int ncols_interleaved = 16;
2446
+ const int blocklen = 1;
1221
2447
 
1222
2448
  assert (n % qk == 0);
1223
2449
  assert (nr % 4 == 0);
@@ -1233,59 +2459,97 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1233
2459
  UNUSED(ncols_interleaved);
1234
2460
  UNUSED(blocklen);
1235
2461
 
1236
- {
1237
- float sumf[4][4];
1238
- int sumi;
2462
+ float sumf[4][16];
2463
+ float sum_minf[4][16];
2464
+ uint8_t scales[128];
2465
+ uint8_t mins[128];
2466
+ int sumi1;
2467
+ int sumi2;
2468
+ int sumi;
2469
+
2470
+ for (int y = 0; y < nr / 4; y++) {
2471
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2472
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2473
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
2474
+ for (int m = 0; m < 4; m++) {
2475
+ for (int j = 0; j < ncols_interleaved; j++) {
2476
+ sumf[m][j] = 0.0;
2477
+ sum_minf[m][j] = 0.0;
2478
+ }
2479
+ }
2480
+ for (int l = 0; l < nb; l++) {
2481
+ for (int i = 0; i < 128; i++) {
2482
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
2483
+ mins[i] = b_ptr[l].scales[i] >> 4;
2484
+ }
2485
+ for (int i = 0; i < 64; i++) {
2486
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
2487
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
2488
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
2489
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
2490
+ }
1239
2491
 
1240
- for (int y = 0; y < nr / 4; y++) {
1241
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1242
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1243
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1244
- for (int m = 0; m < 4; m++) {
1245
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2492
+ for (int sb = 0; sb < 8; sb++) {
2493
+ uint8_t *min = &mins[sb * 16];
2494
+ for(int m = 0; m < 4; m++) {
2495
+ const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
2496
+ for(int j = 0; j < ncols_interleaved; j++) {
2497
+ sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2498
+ }
2499
+ }
1246
2500
  }
1247
- for (int l = 0; l < nb; l++) {
1248
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2501
+
2502
+ for (int sb = 0; sb < 8; sb += 2) {
2503
+ uint8_t *scales_0 = &scales[sb * 16];
2504
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
2505
+
2506
+ for (int i = 0; i < QK4_0; i++) {
1249
2507
  for (int m = 0; m < 4; m++) {
1250
2508
  for (int j = 0; j < ncols_interleaved; j++) {
2509
+ sumi1 = 0;
2510
+ sumi2 = 0;
1251
2511
  sumi = 0;
1252
- for (int i = 0; i < blocklen; ++i) {
1253
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1254
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1255
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1256
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
1257
- }
1258
- sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2512
+
2513
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
2514
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
2515
+ sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
2516
+ sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
2517
+ sumi1 = sumi1 * scales_0[j];
2518
+ sumi2 = sumi2 * scales_1[j];
2519
+ sumi += sumi1 + sumi2;
2520
+
2521
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1259
2522
  }
1260
2523
  }
1261
2524
  }
1262
2525
  }
1263
- for (int m = 0; m < 4; m++) {
1264
- for (int j = 0; j < ncols_interleaved; j++)
1265
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2526
+ }
2527
+ for (int m = 0; m < 4; m++) {
2528
+ for (int j = 0; j < ncols_interleaved; j++) {
2529
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1266
2530
  }
1267
2531
  }
1268
2532
  }
1269
2533
  }
1270
2534
  }
1271
2535
 
1272
- void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2536
+ void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1273
2537
  const int qk = QK8_0;
1274
2538
  const int nb = n / qk;
1275
- const int ncols_interleaved = 8;
1276
- const int blocklen = 8;
2539
+ const int ncols_interleaved = 16;
2540
+ const int blocklen = 1;
1277
2541
 
1278
2542
  assert(n % qk == 0);
1279
2543
  assert(nr % 4 == 0);
1280
2544
  assert(nc % ncols_interleaved == 0);
1281
2545
 
1282
- float sumf[4][8];
2546
+ float sumf[4][16];
1283
2547
  int sumi;
1284
2548
 
1285
2549
  for (int y = 0; y < nr / 4; y++) {
1286
2550
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1287
2551
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1288
- const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
2552
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
1289
2553
  for (int m = 0; m < 4; m++) {
1290
2554
  for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1291
2555
  }
@@ -1298,7 +2562,7 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1298
2562
  const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1299
2563
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1300
2564
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1301
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2565
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
1302
2566
  }
1303
2567
  sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1304
2568
  }
@@ -1313,29 +2577,23 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
1313
2577
  }
1314
2578
  }
1315
2579
 
1316
- void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
1317
- float * GGML_RESTRICT s,
1318
- size_t bs,
1319
- const void * GGML_RESTRICT vx,
1320
- const void * GGML_RESTRICT vy,
1321
- int nr,
1322
- int nc) {
2580
+ void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1323
2581
  const int qk = QK8_0;
1324
2582
  const int nb = n / qk;
1325
- const int ncols_interleaved = 4;
1326
- const int blocklen = 4;
2583
+ const int ncols_interleaved = 16;
2584
+ const int blocklen = 1;
1327
2585
 
1328
2586
  assert(n % qk == 0);
1329
2587
  assert(nr % 4 == 0);
1330
2588
  assert(nc % ncols_interleaved == 0);
1331
2589
 
1332
- float sumf[4][4];
2590
+ float sumf[4][16];
1333
2591
  int sumi;
1334
2592
 
1335
2593
  for (int y = 0; y < nr / 4; y++) {
1336
2594
  const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1337
2595
  for (int x = 0; x < nc / ncols_interleaved; x++) {
1338
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2596
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
1339
2597
  for (int m = 0; m < 4; m++) {
1340
2598
  for (int j = 0; j < ncols_interleaved; j++) {
1341
2599
  sumf[m][j] = 0.0;
@@ -1365,57 +2623,102 @@ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
1365
2623
  }
1366
2624
  }
1367
2625
 
1368
- void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
1369
- float * GGML_RESTRICT s,
1370
- size_t bs,
1371
- const void * GGML_RESTRICT vx,
1372
- const void * GGML_RESTRICT vy,
1373
- int nr,
1374
- int nc) {
1375
- const int qk = QK8_0;
1376
- const int nb = n / qk;
1377
- const int ncols_interleaved = 4;
1378
- const int blocklen = 8;
1379
2626
 
1380
- assert(n % qk == 0);
2627
+ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2628
+ assert(n % QK_K == 0);
1381
2629
  assert(nr % 4 == 0);
1382
- assert(nc % ncols_interleaved == 0);
2630
+ assert(nc % 16 == 0);
2631
+ const int nb = n / QK_K;
2632
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
2633
+ const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
2634
+
2635
+ const int sb_perm[16] = {
2636
+ 0, 4, 1, 5, 2, 6, 3, 7,
2637
+ 8, 12, 9, 13, 10, 14, 11, 15
2638
+ };
1383
2639
 
1384
- float sumf[4][4];
1385
- int sumi;
2640
+ // Iterate Rows in tiles of 4
2641
+ for (int row_tile = 0; row_tile < nr; row_tile += 4) {
2642
+ // Iterate Columns in tiles of 16
2643
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
2644
+
2645
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
2646
+ const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
2647
+
2648
+ float sumf[4][16];
2649
+ memset(sumf, 0, sizeof(sumf));
2650
+
2651
+ for (int k_block = 0; k_block < nb; ++k_block) {
2652
+ int32_t isum[4][16];
2653
+ int32_t summs[4][16];
2654
+ memset(isum, 0, sizeof(isum));
2655
+ memset(summs, 0, sizeof(summs));
2656
+
2657
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
2658
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
2659
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
2660
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
2661
+
2662
+ for (int sb = 0; sb < 16; ++sb) {
2663
+ int scale_offset = sb_perm[sb] * 16;
2664
+
2665
+ int byte_base;
2666
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
2667
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
2668
+ int shift = ((sb / 2) % 4) * 2;
2669
+
2670
+ for (int col = 0; col < 16; ++col) {
2671
+ uint8_t sc_val = sc_rhs[scale_offset + col];
2672
+ int32_t d_sb = sc_val & 0xF;
2673
+ int32_t m_sb = sc_val >> 4;
2674
+
2675
+ // Correction Term
2676
+ for (int r = 0; r < 4; ++r) {
2677
+ int bsum_idx = (sb / 4) * 16 + r * 4 + (sb % 4);
2678
+ summs[r][col] += bs_lhs[bsum_idx] * m_sb;
2679
+ }
1386
2680
 
1387
- for (int y = 0; y < nr / 4; y++) {
1388
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1389
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1390
- const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1391
- for (int m = 0; m < 4; m++) {
1392
- for (int j = 0; j < ncols_interleaved; j++) {
1393
- sumf[m][j] = 0.0;
1394
- }
1395
- }
1396
- for (int l = 0; l < nb; l++) {
1397
- for (int k = 0; k < (qk / blocklen); k++) {
1398
- for (int m = 0; m < 4; m++) {
1399
- for (int j = 0; j < ncols_interleaved; j++) {
1400
- sumi = 0;
1401
- for (int i = 0; i < blocklen; ++i) {
1402
- const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1403
- sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
2681
+ // Main Dot Product
2682
+ for (int l = 0; l < 16; ++l) {
2683
+ int qs_idx = (byte_base + l) * 16 + col;
2684
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
2685
+
2686
+ // Calculate Q8 index for this specific k and row
2687
+ int k = sb * 16 + l;
2688
+ int q8_idx = (k / 4) * 16 + (k % 4);
2689
+
2690
+ for (int r = 0; r < 4; ++r) {
2691
+ // Add r*4 to jump to the correct row within the 4x4 chunk
2692
+ int8_t q8_val = qs_lhs[q8_idx + r * 4];
2693
+ isum[r][col] += q8_val * q2_val * d_sb;
1404
2694
  }
1405
- sumf[m][j] +=
1406
- sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1407
2695
  }
1408
2696
  }
1409
2697
  }
2698
+
2699
+ // Finalize K-Block
2700
+ for (int col = 0; col < 16; ++col) {
2701
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
2702
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
2703
+
2704
+ for (int r = 0; r < 4; ++r) {
2705
+ float d_lhs = y_ptr[k_block].d[r];
2706
+ float d_all = d_lhs * d_rhs;
2707
+ float d_min = d_lhs * dm_rhs;
2708
+ sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
2709
+ }
2710
+ }
1410
2711
  }
1411
- for (int m = 0; m < 4; m++) {
1412
- for (int j = 0; j < ncols_interleaved; j++) {
1413
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2712
+
2713
+ for (int r = 0; r < 4; ++r) {
2714
+ for (int col = 0; col < 16; ++col) {
2715
+ s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
1414
2716
  }
1415
2717
  }
1416
2718
  }
1417
2719
  }
1418
2720
  }
2721
+ #endif
1419
2722
 
1420
2723
  } // extern "C"
1421
2724
 
@@ -1498,16 +2801,212 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
1498
2801
 
1499
2802
  uint64_t elems;
1500
2803
  memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1501
- elems ^= xor_mask;
2804
+ elems ^= xor_mask;
2805
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
2806
+ }
2807
+
2808
+ return out;
2809
+ }
2810
+
2811
+ static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
2812
+ block_q4_0x16 out;
2813
+
2814
+ for (int i = 0; i < 16; i++) {
2815
+ out.d[i] = in[i].d;
2816
+ }
2817
+
2818
+ const int end = QK4_0 * 8 / blck_size_interleave;
2819
+
2820
+ if (blck_size_interleave == 1) {
2821
+ const uint8_t xor_mask = 0x88;
2822
+ for (int i = 0; i < end; ++i) {
2823
+ int src_id = i % 16;
2824
+ int src_offset = i / 16;
2825
+ int dst_offset = i;
2826
+
2827
+ out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
2828
+ }
2829
+ } else {
2830
+ GGML_ASSERT(false);
2831
+ }
2832
+
2833
+ return out;
2834
+ }
2835
+
2836
+ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
2837
+ block_q4_Kx8 out;
2838
+ //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
2839
+ for (int i = 0; i < 8; i++) {
2840
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2841
+ }
2842
+
2843
+ for (int i = 0; i < 8; i++) {
2844
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2845
+ }
2846
+
2847
+ const int end = QK_K * 4 / blck_size_interleave;
2848
+
2849
+ // Interleave Q4_K quants by taking 8 bytes at a time
2850
+ for (int i = 0; i < end; ++i) {
2851
+ int src_id = i % 8;
2852
+ int src_offset = (i / 8) * blck_size_interleave;
2853
+ int dst_offset = i * blck_size_interleave;
2854
+
2855
+ // buffer large enough for the max interleave block size (8 bytes)
2856
+ uint64_t elems;
2857
+ memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
2858
+ memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
2859
+ }
2860
+
2861
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
2862
+ // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
2863
+ // The output Q4_Kx8 structure has 96 bytes
2864
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
2865
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
2866
+ uint8_t s[8], m[8];
2867
+
2868
+ for (int i = 0; i < 4; i++) {
2869
+ for (int j = 0; j < 8; j++) {
2870
+ s[j] = in[j].scales[i] & 63;
2871
+ m[j] = in[j].scales[i + 4] & 63;
2872
+ }
2873
+
2874
+ out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
2875
+ out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
2876
+ out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
2877
+ out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
2878
+ out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
2879
+ out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
2880
+ out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
2881
+ out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
2882
+ out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
2883
+ out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
2884
+ out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
2885
+ out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
2886
+
2887
+ }
2888
+
2889
+ for (int i = 0; i < 4; i++) {
2890
+ for (int j = 0; j < 8; j++) {
2891
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2892
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2893
+ }
2894
+
2895
+ out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
2896
+ out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
2897
+ out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
2898
+ out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
2899
+ out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
2900
+ out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
2901
+ out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
2902
+ out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
2903
+ out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
2904
+ out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
2905
+ out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
2906
+ out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
2907
+
2908
+ }
2909
+
2910
+ return out;
2911
+ }
2912
+
2913
+ static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
2914
+ block_q4_Kx16 out;
2915
+ //Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
2916
+ for (int i = 0; i < 16; i++) {
2917
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2918
+ }
2919
+
2920
+ for (int i = 0; i < 16; i++) {
2921
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2922
+ }
2923
+
2924
+ const int end = QK_K * 8 / blck_size_interleave;
2925
+
2926
+ if (blck_size_interleave == 1) {
2927
+ for (int i = 0; i < end; ++i) {
2928
+ int src_id = i % 16;
2929
+ int src_offset = i / 16;
2930
+ int dst_offset = i;
2931
+
2932
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
2933
+ }
2934
+
2935
+ // RVV repacking.
2936
+ //
2937
+ // Extract sums and mins for all 8 sub-blocks for each block of Q4_K.
2938
+ uint8_t s[128], m[128];
2939
+ for (int i = 0; i < 4; i++) {
2940
+ for (int j = 0; j < 16; j++) {
2941
+ s[i * 16 + j] = in[j].scales[i] & 63;
2942
+ m[i * 16 + j] = in[j].scales[i + 4] & 63;
2943
+ }
2944
+ }
2945
+ for (int i = 0; i < 4; i++) {
2946
+ for (int j = 0; j < 16; j++) {
2947
+ s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2948
+ m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2949
+ }
2950
+ }
2951
+
2952
+ for (int i = 0; i < 128; i++) {
2953
+ out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
2954
+ }
2955
+ for (int i = 0; i < 64; i++) {
2956
+ out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
2957
+ }
2958
+ } else {
2959
+ GGML_ASSERT(false);
2960
+ }
2961
+
2962
+ return out;
2963
+ }
2964
+
2965
+ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
2966
+ block_q2_Kx8 out;
2967
+
2968
+ // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
2969
+ for (int i = 0; i < 8; i++) {
2970
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2971
+ }
2972
+
2973
+ for (int i = 0; i < 8; i++) {
2974
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2975
+ }
2976
+
2977
+ const int end = QK_K * 2 / blck_size_interleave;
2978
+
2979
+ // Interleave Q2_K quants by taking 8 bytes at a time
2980
+ for (int i = 0; i < end; ++i) {
2981
+ int src_id = i % 8;
2982
+ int src_offset = (i / 8) * blck_size_interleave;
2983
+ int dst_offset = i * blck_size_interleave;
2984
+
2985
+ uint64_t elems;
2986
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1502
2987
  memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
1503
2988
  }
1504
2989
 
2990
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
2991
+ // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
2992
+ // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
2993
+ // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
2994
+ // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
2995
+
2996
+ for (int i = 0; i < 128; i++) {
2997
+ // Index for selecting which q2k super block
2998
+ int src1 = (i % 16) / 2;
2999
+ // Index for selecting scale
3000
+ int src2 = ((i / 16) * 2) + (i % 2);
3001
+
3002
+ out.scales[i] = in[src1].scales[src2];
3003
+ }
1505
3004
  return out;
1506
3005
  }
1507
3006
 
1508
- static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
1509
- block_q4_Kx8 out;
1510
- //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
3007
+ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
3008
+ block_q5_Kx8 out;
3009
+ //Delta(scale) and dmin values of the eight Q5_K structures are copied onto the output interleaved structure
1511
3010
  for (int i = 0; i < 8; i++) {
1512
3011
  out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1513
3012
  }
@@ -1518,22 +3017,33 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1518
3017
 
1519
3018
  const int end = QK_K * 4 / blck_size_interleave;
1520
3019
 
1521
- // Interleave Q4_K quants by taking 8 bytes at a time
3020
+ // Interleave Q5_K quants by taking blck_size_interleave bytes at a time
1522
3021
  for (int i = 0; i < end; ++i) {
1523
- int src_id = i % 8;
3022
+ int src_id = i % 8;
1524
3023
  int src_offset = (i / 8) * blck_size_interleave;
1525
3024
  int dst_offset = i * blck_size_interleave;
1526
3025
 
1527
- uint64_t elems;
1528
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1529
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
3026
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
1530
3027
  }
1531
3028
 
1532
- // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
1533
- // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
1534
- // The output Q4_Kx8 structure has 96 bytes
1535
- // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
1536
- // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
3029
+ // Repeat for high bits with the same chunk size, since
3030
+ // the high bits are interleaved in Q5_K and the index is
3031
+ // qh_idx = (qs_idx % 32);
3032
+ // qh_val = qh[qh_idx] >> (qs_idx / 32);
3033
+ for (int i = 0; i < end / 4; ++i) {
3034
+ int src_id = i % 8;
3035
+ int src_offset = (i / 8) * blck_size_interleave;
3036
+ int dst_offset = i * blck_size_interleave;
3037
+
3038
+ memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
3039
+ }
3040
+
3041
+ // The below logic is copied over from Q4_K
3042
+ // The point is to unpack all the scales and mins for each sub block every time we load 12 bytes.
3043
+ // Currently the Q5_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
3044
+ // The output Q5_Kx8 structure has 96 bytes
3045
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q5_K structure
3046
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q5_K structures
1537
3047
  uint8_t s[8], m[8];
1538
3048
 
1539
3049
  for (int i = 0; i < 4; i++) {
@@ -1554,13 +3064,12 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1554
3064
  out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
1555
3065
  out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
1556
3066
  out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
1557
-
1558
3067
  }
1559
3068
 
1560
3069
  for (int i = 0; i < 4; i++) {
1561
3070
  for (int j = 0; j < 8; j++) {
1562
- s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
1563
- m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
3071
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
3072
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
1564
3073
  }
1565
3074
 
1566
3075
  out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
@@ -1575,54 +3084,117 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
1575
3084
  out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
1576
3085
  out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
1577
3086
  out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
1578
-
1579
3087
  }
1580
3088
 
1581
3089
  return out;
1582
3090
  }
1583
3091
 
1584
- static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
1585
- block_q2_Kx8 out;
3092
+ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
3093
+ block_q6_Kx8 out;
3094
+ constexpr int n_blocks = 8; // Kx8
3095
+ for (int i = 0; i < n_blocks; i++) {
3096
+ out.d[i] = in[i].d;
3097
+ }
1586
3098
 
1587
- // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
1588
- for (int i = 0; i < 8; i++) {
1589
- out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
3099
+ const int end_ls = QK_K * 4 / blck_size_interleave;
3100
+ // Interleave Q6_K quants by taking blck_size_interleave bytes at a time
3101
+ for (int i = 0; i < end_ls; ++i) {
3102
+ int src_id = i % n_blocks;
3103
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3104
+ int dst_offset = i * blck_size_interleave;
3105
+
3106
+ uint64_t elem_ls;
3107
+ memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
3108
+ memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
1590
3109
  }
1591
3110
 
1592
- for (int i = 0; i < 8; i++) {
3111
+ // Interleave high bits using same chunk size as low bits
3112
+ const int end_hs = end_ls / 2;
3113
+ for (int i = 0; i < end_hs; ++i) {
3114
+ int src_id = i % n_blocks;
3115
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3116
+ int dst_offset = i * blck_size_interleave;
3117
+
3118
+ uint64_t elem_hs;
3119
+ memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
3120
+ memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
3121
+ }
3122
+
3123
+ // The below logic is designed so as to unpack and rearrange scales in Q6_K
3124
+ // The output Q6_Kx8 structure interleaves the 8 bit scales in the same fashion as the quants
3125
+ // Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
3126
+ // scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
3127
+ constexpr int n_scales = QK_K / 16;
3128
+
3129
+ for (int i = 0; i < n_blocks; i++) {
3130
+ for (int j = 0; j < n_scales; j++) {
3131
+ out.scales[j * n_blocks + i] = in[i].scales[j];
3132
+ }
3133
+ }
3134
+
3135
+ return out;
3136
+ }
3137
+
3138
+ static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
3139
+ block_q2_Kx16 out;
3140
+ constexpr int N_COLS = 16;
3141
+
3142
+ // 1. Copy Super-Scales (d) and Super-Mins (dmin)
3143
+ for (int i = 0; i < N_COLS; i++) {
3144
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
1593
3145
  out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
1594
3146
  }
1595
3147
 
1596
- const int end = QK_K * 2 / blck_size_interleave;
3148
+ // 2. Interleave Q2_K Data
3149
+ const int bytes_per_col = 64;
3150
+ const int total_bytes = N_COLS * bytes_per_col;
3151
+ const int end = total_bytes / blck_size_interleave;
1597
3152
 
1598
- // Interleave Q2_K quants by taking 8 bytes at a time
1599
3153
  for (int i = 0; i < end; ++i) {
1600
- int src_id = i % 8;
1601
- int src_offset = (i / 8) * blck_size_interleave;
3154
+ int src_col_id = i % N_COLS;
3155
+ int src_offset = (i / N_COLS) * blck_size_interleave;
1602
3156
  int dst_offset = i * blck_size_interleave;
1603
-
1604
- uint64_t elems;
1605
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
1606
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
3157
+ memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
1607
3158
  }
1608
3159
 
1609
- // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
1610
- // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
1611
- // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
1612
- // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
1613
- // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
3160
+ // 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
3161
+ int out_idx = 0;
1614
3162
 
1615
- for(int i = 0; i < 128; i++){
3163
+ // Arrays define the sub-block order for each group
3164
+ const int even_low_sbs[] = {0, 2, 4, 6};
3165
+ const int odd_low_sbs[] = {1, 3, 5, 7};
3166
+ const int even_high_sbs[] = {8, 10, 12, 14};
3167
+ const int odd_high_sbs[] = {9, 11, 13, 15};
1616
3168
 
1617
- // Index for selecting which q2k super block
1618
- int src1 = (i % 16) / 2;
1619
- // Index for selecting scale
1620
- int src2 = ((i / 16) * 2) + (i % 2);
3169
+ // Pack Group 1: Even-Low
3170
+ for (int sb : even_low_sbs) {
3171
+ for (int col = 0; col < N_COLS; col++) {
3172
+ out.scales[out_idx++] = in[col].scales[sb];
3173
+ }
3174
+ }
1621
3175
 
1622
- out.scales[i] = in[src1].scales[src2];
3176
+ // Pack Group 2: Odd-Low
3177
+ for (int sb : odd_low_sbs) {
3178
+ for (int col = 0; col < N_COLS; col++) {
3179
+ out.scales[out_idx++] = in[col].scales[sb];
3180
+ }
3181
+ }
3182
+
3183
+ // Pack Group 3: Even-High
3184
+ for (int sb : even_high_sbs) {
3185
+ for (int col = 0; col < N_COLS; col++) {
3186
+ out.scales[out_idx++] = in[col].scales[sb];
3187
+ }
1623
3188
  }
1624
- return out;
1625
3189
 
3190
+ // Pack Group 4: Odd-High
3191
+ for (int sb : odd_high_sbs) {
3192
+ for (int col = 0; col < N_COLS; col++) {
3193
+ out.scales[out_idx++] = in[col].scales[sb];
3194
+ }
3195
+ }
3196
+
3197
+ return out;
1626
3198
  }
1627
3199
 
1628
3200
  static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
@@ -1687,6 +3259,36 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
1687
3259
  GGML_UNUSED(data_size);
1688
3260
  }
1689
3261
 
3262
+ static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3263
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
3264
+ constexpr int nrows_interleaved = 16;
3265
+
3266
+ block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
3267
+ const block_q4_K * src = (const block_q4_K*) data;
3268
+ block_q4_K dst_tmp[16];
3269
+ int nrow = ggml_nrows(t);
3270
+ int nblocks = t->ne[0] / QK_K;
3271
+
3272
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
3273
+
3274
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3275
+ return -1;
3276
+ }
3277
+
3278
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3279
+ for (int64_t x = 0; x < nblocks; x++) {
3280
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3281
+ dst_tmp[i] = src[x + i * nblocks];
3282
+ }
3283
+ *dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
3284
+ }
3285
+ src += nrows_interleaved * nblocks;
3286
+ }
3287
+ return 0;
3288
+
3289
+ GGML_UNUSED(data_size);
3290
+ }
3291
+
1690
3292
  static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1691
3293
  GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
1692
3294
  GGML_ASSERT(interleave_block == 8);
@@ -1706,7 +3308,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
1706
3308
 
1707
3309
  for (int b = 0; b < nrow; b += nrows_interleaved) {
1708
3310
  for (int64_t x = 0; x < nblocks; x++) {
1709
- for (int i = 0; i < nrows_interleaved; i++ ) {
3311
+ for (int i = 0; i < nrows_interleaved; i++) {
1710
3312
  dst_tmp[i] = src[x + i * nblocks];
1711
3313
  }
1712
3314
  *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
@@ -1718,6 +3320,132 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
1718
3320
  GGML_UNUSED(data_size);
1719
3321
  }
1720
3322
 
3323
+ static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3324
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
3325
+ constexpr int nrows_interleaved = 16;
3326
+
3327
+ block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
3328
+ const block_q2_K * src = (const block_q2_K*) data;
3329
+
3330
+ block_q2_K dst_tmp[nrows_interleaved];
3331
+
3332
+ int nrow = ggml_nrows(t);
3333
+ int nblocks = t->ne[0] / QK_K;
3334
+
3335
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
3336
+
3337
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3338
+ return -1;
3339
+ }
3340
+
3341
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3342
+ for (int64_t x = 0; x < nblocks; x++) {
3343
+ // This loop gathers 16 separate blocks (one from each column)
3344
+ // that correspond to the same K-dimension chunk.
3345
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3346
+ dst_tmp[i] = src[x + i * nblocks];
3347
+ }
3348
+
3349
+ *dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
3350
+ }
3351
+ src += nrows_interleaved * nblocks;
3352
+ }
3353
+ return 0;
3354
+
3355
+ GGML_UNUSED(data_size);
3356
+ }
3357
+
3358
+ static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3359
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3360
+ constexpr int nrows_interleaved = 16;
3361
+
3362
+ block_q4_0x16 * dst = (block_q4_0x16*)t->data;
3363
+ const block_q4_0 * src = (const block_q4_0*) data;
3364
+ block_q4_0 dst_tmp[16];
3365
+ int nrow = ggml_nrows(t);
3366
+ int nblocks = t->ne[0] / QK4_0;
3367
+
3368
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
3369
+
3370
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3371
+ return -1;
3372
+ }
3373
+
3374
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3375
+ for (int64_t x = 0; x < nblocks; x++) {
3376
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3377
+ dst_tmp[i] = src[x + i * nblocks];
3378
+ }
3379
+ *dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
3380
+ }
3381
+ src += nrows_interleaved * nblocks;
3382
+ }
3383
+ return 0;
3384
+
3385
+ GGML_UNUSED(data_size);
3386
+ }
3387
+
3388
+ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
3389
+ int interleave_block,
3390
+ const void * GGML_RESTRICT data,
3391
+ size_t data_size) {
3392
+ GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
3393
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3394
+ constexpr int nrows_interleaved = 8;
3395
+
3396
+ block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
3397
+ const block_q5_K * src = (const block_q5_K *) data;
3398
+ block_q5_K dst_tmp[8];
3399
+ int nrow = ggml_nrows(t);
3400
+ int nblocks = t->ne[0] / QK_K;
3401
+
3402
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
3403
+
3404
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3405
+ return -1;
3406
+ }
3407
+
3408
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3409
+ for (int64_t x = 0; x < nblocks; x++) {
3410
+ for (int i = 0; i < nrows_interleaved; i++) {
3411
+ dst_tmp[i] = src[x + i * nblocks];
3412
+ }
3413
+ *dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
3414
+ }
3415
+ src += nrows_interleaved * nblocks;
3416
+ }
3417
+ return 0;
3418
+ }
3419
+
3420
+ static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3421
+ GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
3422
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3423
+ constexpr int nrows_interleaved = 8;
3424
+
3425
+ block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
3426
+ const block_q6_K * src = (const block_q6_K *) data;
3427
+ block_q6_K dst_tmp[8];
3428
+ int nrow = ggml_nrows(t);
3429
+ int nblocks = t->ne[0] / QK_K;
3430
+
3431
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
3432
+
3433
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3434
+ return -1;
3435
+ }
3436
+
3437
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3438
+ for (int64_t x = 0; x < nblocks; x++) {
3439
+ for (int i = 0; i < nrows_interleaved; i++) {
3440
+ dst_tmp[i] = src[x + i * nblocks];
3441
+ }
3442
+ *dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
3443
+ }
3444
+ src += nrows_interleaved * nblocks;
3445
+ }
3446
+ return 0;
3447
+ }
3448
+
1721
3449
  static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1722
3450
  GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
1723
3451
  GGML_ASSERT(interleave_block == 8);
@@ -1757,9 +3485,63 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
1757
3485
  GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1758
3486
  constexpr int nrows_interleaved = 4;
1759
3487
 
1760
- block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
3488
+ block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
3489
+ const block_q8_0 * src = (const block_q8_0 *) data;
3490
+ block_q8_0 dst_tmp[4];
3491
+ int nrow = ggml_nrows(t);
3492
+ int nblocks = t->ne[0] / QK8_0;
3493
+
3494
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
3495
+
3496
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3497
+ return -1;
3498
+ }
3499
+
3500
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3501
+ for (int64_t x = 0; x < nblocks; x++) {
3502
+ for (int i = 0; i < nrows_interleaved; i++) {
3503
+ dst_tmp[i] = src[x + i * nblocks];
3504
+ }
3505
+ *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
3506
+ }
3507
+ src += nrows_interleaved * nblocks;
3508
+ }
3509
+ return 0;
3510
+ }
3511
+
3512
+ static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
3513
+ block_q8_0x16 out;
3514
+
3515
+ for (int i = 0; i < 16; i++) {
3516
+ out.d[i] = in[i].d;
3517
+ }
3518
+
3519
+ const int end = QK8_0 * 16 / blck_size_interleave;
3520
+
3521
+ if (blck_size_interleave == 1) {
3522
+ for (int i = 0; i < end; ++i) {
3523
+ int src_id = i % 16;
3524
+ int src_offset = i / 16;
3525
+ int dst_offset = i;
3526
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3527
+ }
3528
+ } else {
3529
+ GGML_ASSERT(false);
3530
+ }
3531
+
3532
+ return out;
3533
+ }
3534
+
3535
+ static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
3536
+ int interleave_block,
3537
+ const void * GGML_RESTRICT data,
3538
+ size_t data_size) {
3539
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
3540
+ constexpr int nrows_interleaved = 16;
3541
+
3542
+ block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
1761
3543
  const block_q8_0 * src = (const block_q8_0 *) data;
1762
- block_q8_0 dst_tmp[4];
3544
+ block_q8_0 dst_tmp[16];
1763
3545
  int nrow = ggml_nrows(t);
1764
3546
  int nblocks = t->ne[0] / QK8_0;
1765
3547
 
@@ -1774,7 +3556,7 @@ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
1774
3556
  for (int i = 0; i < nrows_interleaved; i++) {
1775
3557
  dst_tmp[i] = src[x + i * nblocks];
1776
3558
  }
1777
- *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
3559
+ *dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
1778
3560
  }
1779
3561
  src += nrows_interleaved * nblocks;
1780
3562
  }
@@ -1906,6 +3688,177 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
1906
3688
  GGML_UNUSED(data_size);
1907
3689
  }
1908
3690
 
3691
+ static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
3692
+ block_iq4_nlx16 out;
3693
+
3694
+ for (int i = 0; i < 16; i++) {
3695
+ out.d[i] = in[i].d;
3696
+ }
3697
+
3698
+ const int end = QK4_NL * 8 / blck_size_interleave;
3699
+
3700
+ if (blck_size_interleave == 1) {
3701
+ for (int i = 0; i < end; ++i) {
3702
+ int src_id = i % 16;
3703
+ int src_offset = i / 16;
3704
+ int dst_offset = i;
3705
+
3706
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3707
+ }
3708
+ } else {
3709
+ GGML_ASSERT(false);
3710
+ }
3711
+
3712
+ return out;
3713
+ }
3714
+
3715
+ static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3716
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
3717
+ GGML_ASSERT(interleave_block == 1);
3718
+
3719
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
3720
+ block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
3721
+
3722
+ block_iq4_nl dst_tmp[16];
3723
+
3724
+ int nrow = ggml_nrows(t);
3725
+ int nrows_interleaved = 16;
3726
+ int nblocks = t->ne[0] / QK4_NL;
3727
+
3728
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
3729
+
3730
+ if (t->ne[1] % nrows_interleaved != 0) {
3731
+ return -1;
3732
+ }
3733
+
3734
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3735
+ for (int64_t x = 0; x < nblocks; x++) {
3736
+ for (int i = 0; i < nrows_interleaved; i++) {
3737
+ dst_tmp[i] = src[x + i * nblocks];
3738
+ }
3739
+ *dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
3740
+ }
3741
+ src += nrows_interleaved * nblocks;
3742
+ }
3743
+ return 0;
3744
+
3745
+ GGML_UNUSED(data_size);
3746
+ }
3747
+
3748
+ static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
3749
+ block_mxfp4x4 out;
3750
+
3751
+ for (int i = 0; i < 4; i++) {
3752
+ out.e[i] = in[i].e;
3753
+ }
3754
+
3755
+ const int end = QK_MXFP4 * 2 / blck_size_interleave;
3756
+
3757
+ if (blck_size_interleave == 4) {
3758
+ for (int i = 0; i < end; ++i) {
3759
+ int src_id = i % 4;
3760
+ int src_offset = (i / 4) * blck_size_interleave;
3761
+ int dst_offset = i * blck_size_interleave;
3762
+
3763
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
3764
+ }
3765
+ } else {
3766
+ GGML_ASSERT(false);
3767
+ }
3768
+
3769
+ return out;
3770
+ }
3771
+
3772
+ static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3773
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3774
+ GGML_ASSERT(interleave_block == 4);
3775
+
3776
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3777
+ block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
3778
+
3779
+ block_mxfp4 dst_tmp[4];
3780
+
3781
+ int nrow = ggml_nrows(t);
3782
+ int nrows_interleaved = 4;
3783
+ int nblocks = t->ne[0] / QK_MXFP4;
3784
+
3785
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3786
+
3787
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3788
+ return -1;
3789
+ }
3790
+
3791
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3792
+ for (int64_t x = 0; x < nblocks; x++) {
3793
+ for (int i = 0; i < nrows_interleaved; i++) {
3794
+ dst_tmp[i] = src[x + i * nblocks];
3795
+ }
3796
+ *dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
3797
+ }
3798
+ src += nrows_interleaved * nblocks;
3799
+ }
3800
+ return 0;
3801
+
3802
+ GGML_UNUSED(data_size);
3803
+ }
3804
+
3805
+ static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
3806
+ block_mxfp4x8 out;
3807
+
3808
+ for (int i = 0; i < 8; i++) {
3809
+ out.e[i] = in[i].e;
3810
+ }
3811
+
3812
+ const int end = QK_MXFP4 * 4 / blck_size_interleave;
3813
+
3814
+ if (blck_size_interleave == 8) {
3815
+ for (int i = 0; i < end; ++i) {
3816
+ int src_id = i % 8;
3817
+ int src_offset = (i / 8) * blck_size_interleave;
3818
+ int dst_offset = i * blck_size_interleave;
3819
+
3820
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
3821
+ }
3822
+ } else {
3823
+ GGML_ASSERT(false);
3824
+ }
3825
+
3826
+ return out;
3827
+ }
3828
+
3829
+ static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3830
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3831
+ GGML_ASSERT(interleave_block == 8);
3832
+
3833
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3834
+ block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
3835
+
3836
+ block_mxfp4 dst_tmp[8];
3837
+
3838
+ int nrow = ggml_nrows(t);
3839
+ int nrows_interleaved = 8;
3840
+ int nblocks = t->ne[0] / QK_MXFP4;
3841
+
3842
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3843
+
3844
+ if (t->ne[1] % nrows_interleaved != 0) {
3845
+ return -1;
3846
+ }
3847
+
3848
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3849
+ for (int64_t x = 0; x < nblocks; x++) {
3850
+ for (int i = 0; i < nrows_interleaved; i++) {
3851
+ dst_tmp[i] = src[x + i * nblocks];
3852
+ }
3853
+ *dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
3854
+ }
3855
+ src += nrows_interleaved * nblocks;
3856
+ }
3857
+ return 0;
3858
+
3859
+ GGML_UNUSED(data_size);
3860
+ }
3861
+
1909
3862
  namespace ggml::cpu::repack {
1910
3863
  // repack
1911
3864
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1936,6 +3889,22 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
1936
3889
  return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
1937
3890
  }
1938
3891
 
3892
+ template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3893
+ return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
3894
+ }
3895
+
3896
+ template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3897
+ return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
3898
+ }
3899
+
3900
+ template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3901
+ return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
3902
+ }
3903
+
3904
+ template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3905
+ return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
3906
+ }
3907
+
1939
3908
  template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1940
3909
  return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
1941
3910
  }
@@ -1949,6 +3918,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
1949
3918
  return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
1950
3919
  }
1951
3920
 
3921
+ template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3922
+ return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
3923
+ }
3924
+
3925
+ template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3926
+ return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
3927
+ }
3928
+
1952
3929
  template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
1953
3930
  return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
1954
3931
  }
@@ -1957,6 +3934,28 @@ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * da
1957
3934
  return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
1958
3935
  }
1959
3936
 
3937
+ #if defined __riscv_zvfh
3938
+ template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3939
+ return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
3940
+ }
3941
+
3942
+ template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3943
+ return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
3944
+ }
3945
+
3946
+ template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3947
+ return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
3948
+ }
3949
+
3950
+ template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3951
+ return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
3952
+ }
3953
+
3954
+ template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3955
+ return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
3956
+ }
3957
+ #endif
3958
+
1960
3959
  // gemv
1961
3960
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1962
3961
  void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1973,6 +3972,17 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
1973
3972
  ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1974
3973
  }
1975
3974
 
3975
+ template <>
3976
+ void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
3977
+ float * s,
3978
+ size_t bs,
3979
+ const void * vx,
3980
+ const void * vy,
3981
+ int nr,
3982
+ int nc) {
3983
+ ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3984
+ }
3985
+
1976
3986
  template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1977
3987
  ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
1978
3988
  }
@@ -1981,8 +3991,20 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
1981
3991
  ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1982
3992
  }
1983
3993
 
1984
- template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1985
- ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3994
+ template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3995
+ ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
3996
+ }
3997
+
3998
+ template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3999
+ ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4000
+ }
4001
+
4002
+ template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4003
+ ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4004
+ }
4005
+
4006
+ template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4007
+ ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
1986
4008
  }
1987
4009
 
1988
4010
  template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
@@ -1993,6 +4015,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
1993
4015
  ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1994
4016
  }
1995
4017
 
4018
+ template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4019
+ ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4020
+ }
4021
+
4022
+ template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4023
+ ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4024
+ }
4025
+
1996
4026
  template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1997
4027
  ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1998
4028
  }
@@ -2001,6 +4031,28 @@ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2001
4031
  ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2002
4032
  }
2003
4033
 
4034
+ #if defined __riscv_zvfh
4035
+ template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4036
+ ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4037
+ }
4038
+
4039
+ template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4040
+ ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4041
+ }
4042
+
4043
+ template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4044
+ ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4045
+ }
4046
+
4047
+ template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4048
+ ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4049
+ }
4050
+
4051
+ template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4052
+ ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4053
+ }
4054
+ #endif
4055
+
2004
4056
  // gemm
2005
4057
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
2006
4058
  void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -2013,20 +4065,43 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2013
4065
  ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2014
4066
  }
2015
4067
 
2016
- template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2017
- ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4068
+ template <>
4069
+ void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
4070
+ float * s,
4071
+ size_t bs,
4072
+ const void * vx,
4073
+ const void * vy,
4074
+ int nr,
4075
+ int nc) {
4076
+ ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2018
4077
  }
2019
4078
 
2020
- template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2021
- ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4079
+ template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4080
+ ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4081
+ }
4082
+
4083
+ template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4084
+ ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
2022
4085
  }
2023
4086
 
2024
4087
  template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2025
4088
  ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2026
4089
  }
2027
4090
 
2028
- template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2029
- ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4091
+ template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4092
+ ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4093
+ }
4094
+
4095
+ template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4096
+ ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4097
+ }
4098
+
4099
+ template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4100
+ ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4101
+ }
4102
+
4103
+ template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4104
+ ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
2030
4105
  }
2031
4106
 
2032
4107
  template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
@@ -2037,6 +4112,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
2037
4112
  ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
2038
4113
  }
2039
4114
 
4115
+ template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4116
+ ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4117
+ }
4118
+
4119
+ template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4120
+ ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4121
+ }
4122
+
2040
4123
  template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
2041
4124
  ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
2042
4125
  }
@@ -2045,6 +4128,28 @@ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
2045
4128
  ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
2046
4129
  }
2047
4130
 
4131
+ #if defined __riscv_zvfh
4132
+ template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4133
+ ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4134
+ }
4135
+
4136
+ template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4137
+ ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4138
+ }
4139
+
4140
+ template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4141
+ ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4142
+ }
4143
+
4144
+ template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4145
+ ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4146
+ }
4147
+
4148
+ template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4149
+ ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4150
+ }
4151
+ #endif
4152
+
2048
4153
  class tensor_traits_base : public ggml::cpu::tensor_traits {
2049
4154
  public:
2050
4155
  virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2063,7 +4168,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2063
4168
  case GGML_OP_MUL_MAT_ID:
2064
4169
  {
2065
4170
  size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
2066
- size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
4171
+ size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
2067
4172
 
2068
4173
  const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
2069
4174
  const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
@@ -2328,7 +4433,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2328
4433
  auto * wdata = (char *)params->wdata;
2329
4434
  auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
2330
4435
 
2331
- // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
4436
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
2332
4437
  auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
2333
4438
  struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
2334
4439
 
@@ -2393,20 +4498,19 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2393
4498
  for (int ir1 = 0; ir1 < nr1; ir1++) {
2394
4499
  struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
2395
4500
 
2396
- const int id = row_mapping.i1; // selected expert index
4501
+ const int id = row_mapping.i1; // selected expert index
2397
4502
 
2398
4503
  const int64_t i11 = id % ne11;
2399
- const int64_t i12 = row_mapping.i2; // row index in src1
4504
+ const int64_t i12 = row_mapping.i2; // row index in src1
2400
4505
 
2401
- const int64_t i1 = id; // selected expert index
2402
- const int64_t i2 = i12; // row
4506
+ const int64_t i1 = id; // selected expert index
4507
+ const int64_t i2 = i12; // row
2403
4508
 
2404
4509
  const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
2405
4510
 
2406
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
2407
- (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
2408
- src0_cur + src0_cur_start * nb01,
2409
- src1_col, 1, src0_cur_end - src0_cur_start);
4511
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
4512
+ ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
4513
+ src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
2410
4514
  }
2411
4515
  }
2412
4516
  #undef MMID_MATRIX_ROW
@@ -2422,7 +4526,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
2422
4526
  } // namespace ggml::cpu::repack
2423
4527
 
2424
4528
  static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
2425
-
2426
4529
  // instance for Q4
2427
4530
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
2428
4531
  static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
@@ -2432,6 +4535,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2432
4535
  static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
2433
4536
  static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
2434
4537
 
4538
+ // instance for Q5_K
4539
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
4540
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
4541
+
4542
+ // instance for Q6_K
4543
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
4544
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
4545
+
2435
4546
  // instance for Q2
2436
4547
  static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
2437
4548
 
@@ -2439,13 +4550,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2439
4550
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
2440
4551
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
2441
4552
 
4553
+ // instance for MXFP4
4554
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
4555
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
4556
+
2442
4557
  // instance for Q8_0
2443
4558
  static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
2444
4559
  static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
2445
4560
 
4561
+ // instances for RISC-V
4562
+ //
4563
+ // These implement outer-product style matrix multiplication kernels with
4564
+ // an interleave of 1.
4565
+ #if defined __riscv_zvfh
4566
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
4567
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
4568
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
4569
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
4570
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
4571
+ #endif
4572
+
2446
4573
  if (cur->type == GGML_TYPE_Q4_0) {
2447
- if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
2448
- || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
4574
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
2449
4575
  if (cur->ne[1] % 8 == 0) {
2450
4576
  return &q4_0_8x8_q8_0;
2451
4577
  }
@@ -2460,6 +4586,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2460
4586
  return &q4_0_4x4_q8_0;
2461
4587
  }
2462
4588
  }
4589
+ if (ggml_cpu_has_riscv_v()) {
4590
+ #if defined __riscv_zvfh
4591
+ switch (__riscv_vlenb() * 8) {
4592
+ case 128: { break; } // TODO
4593
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
4594
+ case 512: { break; } // TODO
4595
+ case 1024: { break; } // TODO
4596
+ default: { return nullptr; }
4597
+ }
4598
+ #endif
4599
+ }
2463
4600
  } else if (cur->type == GGML_TYPE_Q4_K) {
2464
4601
  if (ggml_cpu_has_avx2()) {
2465
4602
  if (cur->ne[1] % 8 == 0) {
@@ -2476,12 +4613,56 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2476
4613
  return &q4_K_8x4_q8_K;
2477
4614
  }
2478
4615
  }
4616
+ if (ggml_cpu_has_riscv_v()) {
4617
+ #if defined __riscv_zvfh
4618
+ switch (__riscv_vlenb() * 8) {
4619
+ case 128: { break; } // TODO
4620
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
4621
+ case 512: { break; } // TODO
4622
+ case 1024: { break; } // TODO
4623
+ default: { return nullptr; }
4624
+ }
4625
+ #endif
4626
+ }
2479
4627
  } else if (cur->type == GGML_TYPE_Q2_K) {
2480
4628
  if (ggml_cpu_has_avx512()) {
2481
4629
  if (cur->ne[1] % 8 == 0) {
2482
4630
  return &q2_K_8x8_q8_K;
2483
4631
  }
2484
4632
  }
4633
+ if (ggml_cpu_has_riscv_v()) {
4634
+ #if defined __riscv_zvfh
4635
+ switch (__riscv_vlenb() * 8) {
4636
+ case 128: { break; } // TODO
4637
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
4638
+ case 512: { break; } // TODO
4639
+ case 1024: { break; } // TODO
4640
+ default: { return nullptr; }
4641
+ }
4642
+ #endif
4643
+ }
4644
+ } else if (cur->type == GGML_TYPE_Q5_K) {
4645
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4646
+ if (cur->ne[1] % 8 == 0) {
4647
+ return &q5_K_8x8_q8_K;
4648
+ }
4649
+ }
4650
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4651
+ if (cur->ne[1] % 8 == 0) {
4652
+ return &q5_K_8x4_q8_K;
4653
+ }
4654
+ }
4655
+ } else if (cur->type == GGML_TYPE_Q6_K) {
4656
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4657
+ if (cur->ne[1] % 8 == 0) {
4658
+ return &q6_K_8x8_q8_K;
4659
+ }
4660
+ }
4661
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4662
+ if (cur->ne[1] % 8 == 0) {
4663
+ return &q6_K_8x4_q8_K;
4664
+ }
4665
+ }
2485
4666
  } else if (cur->type == GGML_TYPE_IQ4_NL) {
2486
4667
  if (ggml_cpu_has_avx2()) {
2487
4668
  if (cur->ne[1] % 8 == 0) {
@@ -2493,6 +4674,28 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2493
4674
  return &iq4_nl_4x4_q8_0;
2494
4675
  }
2495
4676
  }
4677
+ if (ggml_cpu_has_riscv_v()) {
4678
+ #if defined __riscv_zvfh
4679
+ switch (__riscv_vlenb() * 8) {
4680
+ case 128: { break; } // TODO
4681
+ case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
4682
+ case 512: { break; } // TODO
4683
+ case 1024: { break; } // TODO
4684
+ default: { return nullptr; }
4685
+ }
4686
+ #endif
4687
+ }
4688
+ } else if (cur->type == GGML_TYPE_MXFP4) {
4689
+ if (ggml_cpu_has_avx2()) {
4690
+ if (cur->ne[1] % 8 == 0) {
4691
+ return &mxfp4_8x8_q8_0;
4692
+ }
4693
+ }
4694
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4695
+ if (cur->ne[1] % 4 == 0) {
4696
+ return &mxfp4_4x4_q8_0;
4697
+ }
4698
+ }
2496
4699
  } else if (cur->type == GGML_TYPE_Q8_0) {
2497
4700
  if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2498
4701
  if (cur->ne[1] % 4 == 0) {
@@ -2504,6 +4707,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
2504
4707
  return &q8_0_4x4_q8_0;
2505
4708
  }
2506
4709
  }
4710
+ if (ggml_cpu_has_riscv_v()) {
4711
+ #if defined __riscv_zvfh
4712
+ switch (__riscv_vlenb() * 8) {
4713
+ case 128: { break; } // TODO
4714
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
4715
+ case 512: { break; } // TODO
4716
+ case 1024: { break; } // TODO
4717
+ default: { return nullptr; }
4718
+ }
4719
+ #endif
4720
+ }
2507
4721
  }
2508
4722
 
2509
4723
  return nullptr;