whispercpp 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1017):
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/LICENSE +1 -1
  5. data/README.md +133 -3
  6. data/Rakefile +18 -3
  7. data/ext/dependencies.rb +10 -4
  8. data/ext/dependencies_for_windows.rb +17 -0
  9. data/ext/extconf.rb +20 -7
  10. data/ext/options.rb +54 -14
  11. data/ext/options_for_windows.rb +51 -0
  12. data/ext/ruby_whisper.c +56 -46
  13. data/ext/ruby_whisper.h +165 -2
  14. data/ext/ruby_whisper_context.c +297 -126
  15. data/ext/ruby_whisper_context_params.c +163 -0
  16. data/ext/ruby_whisper_log_queue.c +180 -0
  17. data/ext/ruby_whisper_log_settable.h +47 -0
  18. data/ext/ruby_whisper_model.c +0 -1
  19. data/ext/ruby_whisper_parakeet.c +49 -0
  20. data/ext/ruby_whisper_parakeet_context.c +304 -0
  21. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  22. data/ext/ruby_whisper_parakeet_model.c +84 -0
  23. data/ext/ruby_whisper_parakeet_params.c +548 -0
  24. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  25. data/ext/ruby_whisper_parakeet_token.c +188 -0
  26. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  27. data/ext/ruby_whisper_params.c +256 -66
  28. data/ext/ruby_whisper_segment.c +6 -7
  29. data/ext/ruby_whisper_token.c +29 -9
  30. data/ext/ruby_whisper_transcribe.cpp +46 -16
  31. data/ext/ruby_whisper_vad_context.c +48 -1
  32. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  33. data/ext/ruby_whisper_vad_params.c +0 -1
  34. data/ext/ruby_whisper_vad_segment.c +0 -1
  35. data/ext/ruby_whisper_vad_segments.c +0 -1
  36. data/ext/sources/CMakeLists.txt +41 -3
  37. data/ext/sources/CMakePresets.json +95 -0
  38. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  39. data/ext/sources/cmake/parakeet.pc.in +10 -0
  40. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  41. data/ext/sources/cmake/whisper.pc.in +1 -1
  42. data/ext/sources/examples/CMakeLists.txt +4 -2
  43. data/ext/sources/examples/bench/bench.cpp +24 -19
  44. data/ext/sources/examples/cli/cli.cpp +51 -9
  45. data/ext/sources/examples/common-ggml.cpp +4 -0
  46. data/ext/sources/examples/common-whisper.cpp +139 -67
  47. data/ext/sources/examples/common-whisper.h +11 -0
  48. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  49. data/ext/sources/examples/miniaudio.h +4507 -2131
  50. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  51. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  52. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  53. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  54. data/ext/sources/examples/server/server.cpp +213 -163
  55. data/ext/sources/ggml/CMakeLists.txt +29 -15
  56. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  57. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  58. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  59. data/ext/sources/ggml/include/ggml-backend.h +73 -11
  60. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  61. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  62. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  63. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  64. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  65. data/ext/sources/ggml/include/ggml-rpc.h +8 -3
  66. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  67. data/ext/sources/ggml/include/ggml.h +155 -16
  68. data/ext/sources/ggml/include/gguf.h +10 -2
  69. data/ext/sources/ggml/src/CMakeLists.txt +25 -5
  70. data/ext/sources/ggml/src/ggml-alloc.c +9 -10
  71. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  72. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  73. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  74. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +40 -86
  76. data/ext/sources/ggml/src/ggml-backend.cpp +114 -10
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -2
  79. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +1016 -442
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +111 -85
  83. data/ext/sources/ggml/src/ggml-cann/common.h +23 -14
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +255 -92
  85. data/ext/sources/ggml/src/ggml-common.h +22 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +68 -34
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +44 -19
  88. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  89. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +101 -101
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +194 -1
  91. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2874 -613
  92. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +5480 -840
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1361 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -11
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +186 -36
  99. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +119 -19
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +112 -26
  101. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  102. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  103. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -0
  105. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +153 -16
  106. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +17 -0
  107. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  108. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +976 -251
  109. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +671 -266
  110. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1277 -263
  111. data/ext/sources/ggml/src/ggml-cpu/ops.h +4 -0
  112. data/ext/sources/ggml/src/ggml-cpu/quants.c +95 -0
  113. data/ext/sources/ggml/src/ggml-cpu/quants.h +6 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2893 -679
  115. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  116. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  117. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +114 -19
  118. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  119. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  120. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  121. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  122. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  123. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  124. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  125. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  126. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  127. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  128. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  129. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  130. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  131. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  132. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  133. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  134. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +54 -53
  135. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  136. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +18 -8
  137. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  138. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  139. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +73 -28
  140. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +69 -41
  141. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  142. data/ext/sources/ggml/src/ggml-cuda/common.cuh +359 -29
  143. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  144. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  145. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  146. data/ext/sources/ggml/src/ggml-cuda/convert.cu +94 -27
  147. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  148. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +20 -9
  149. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  150. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +333 -85
  151. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +632 -190
  152. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  153. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +162 -49
  154. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +43 -18
  155. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +44 -14
  156. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  157. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +241 -23
  158. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  159. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  160. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  161. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +312 -0
  162. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  163. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  164. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1454 -599
  165. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  166. data/ext/sources/ggml/src/ggml-cuda/mean.cu +13 -10
  167. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +397 -183
  168. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  169. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +161 -88
  170. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  171. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +522 -431
  172. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +139 -72
  173. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  174. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +608 -88
  175. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -0
  176. data/ext/sources/ggml/src/ggml-cuda/norm.cu +47 -79
  177. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  178. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  179. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +134 -27
  180. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  181. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +7 -17
  182. data/ext/sources/ggml/src/ggml-cuda/rope.cu +244 -137
  183. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  184. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  185. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  186. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  187. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  188. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  189. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  190. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +96 -40
  191. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  192. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  193. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +2 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +2 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +2 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +2 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -5
  226. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +202 -135
  227. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  228. data/ext/sources/ggml/src/ggml-cuda/unary.cu +86 -2
  229. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +4 -0
  230. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +111 -17
  231. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  232. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +30 -2
  233. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +84 -46
  235. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1612 -753
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +51 -11
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +361 -261
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +294 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +753 -241
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +295 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +471 -296
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +159 -53
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +3 -3
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  254. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  255. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  256. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  257. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  258. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  259. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  260. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  261. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  262. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +97 -14
  263. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +163 -67
  264. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  265. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  266. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  267. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  268. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  269. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  270. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  271. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  272. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  273. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  274. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  275. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  276. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  278. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -1348
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +547 -635
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +3556 -1101
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +475 -269
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +94 -72
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +222 -217
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +432 -0
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +886 -117
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  297. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  302. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +28 -9
  303. data/ext/sources/ggml/src/ggml-impl.h +68 -1
  304. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  305. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  306. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  307. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  308. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +409 -83
  309. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +54 -5
  310. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +254 -52
  311. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +254 -23
  312. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +756 -285
  313. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +7 -4
  314. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +359 -133
  315. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1867 -1123
  316. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  317. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +71 -4
  318. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +14127 -5314
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +97 -88
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +104 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1978 -67
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  367. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  368. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  369. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  370. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  371. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  372. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  373. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  374. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  375. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  376. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  377. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  378. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  379. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  380. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  381. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  382. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  383. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  384. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  385. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  386. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  387. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  388. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  389. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +178 -0
  390. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  391. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  392. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  393. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  394. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  395. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  396. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  397. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  398. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  399. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  400. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  401. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  402. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  403. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  404. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  405. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  406. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  407. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  412. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  413. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  414. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  415. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  416. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  417. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  418. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  419. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  420. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  421. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  422. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  423. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  424. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  425. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  426. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  427. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  428. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  429. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  430. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  431. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  432. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  433. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  434. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  435. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  436. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  437. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  438. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  439. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  440. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  441. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  442. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  443. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  444. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +880 -0
  445. data/ext/sources/ggml/src/ggml-openvino/utils.h +143 -0
  446. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  447. data/ext/sources/ggml/src/ggml-quants.c +385 -119
  448. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  449. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  450. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  451. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  452. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  453. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +64 -91
  454. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  455. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +4 -1
  456. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  457. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  458. data/ext/sources/ggml/src/ggml-sycl/common.hpp +356 -11
  459. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +184 -14
  460. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +31 -1
  461. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  462. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  463. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  464. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  465. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  466. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  467. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  468. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  469. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +77 -156
  470. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -2
  471. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  472. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  473. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  474. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  475. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  476. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  477. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +227 -0
  478. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  479. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  480. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  481. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +347 -0
  482. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  483. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  484. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  485. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +1134 -236
  486. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  487. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  488. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  489. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  490. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  491. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  492. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  493. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  494. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +72 -1
  495. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  496. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  497. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  498. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  499. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  500. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  501. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  502. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  503. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  504. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  505. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  506. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  507. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  508. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  509. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  510. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  511. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  512. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  513. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  514. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  515. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  516. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  517. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  518. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  519. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  520. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  521. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  522. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  523. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  524. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  525. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  526. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  527. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  530. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  531. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  532. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  533. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  534. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  535. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  536. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  537. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  538. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  539. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  540. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  541. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  542. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  543. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  544. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  545. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  546. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  547. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  548. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  549. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  550. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  551. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  552. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  553. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  554. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  555. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +228 -53
  556. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  557. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  558. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  559. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  560. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  561. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  562. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  563. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  564. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  565. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  566. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  567. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  568. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  569. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  570. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  571. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  572. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  573. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  574. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  575. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  576. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  577. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  578. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  579. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  580. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  581. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  582. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  583. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  584. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  585. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  586. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  587. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  588. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  589. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  590. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  591. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  592. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  593. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  594. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  595. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  596. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  597. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  599. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3250 -940
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +533 -180
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +113 -68
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +412 -222
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +222 -83
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +189 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +22 -20
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +51 -14
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +39 -63
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +13 -7
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +27 -11
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -149
  663. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  664. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3221 -97
  665. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3493 -1997
  666. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  667. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  668. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  669. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  670. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  671. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +142 -0
  672. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +115 -141
  673. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +93 -0
  674. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  675. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -44
  676. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  677. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +198 -230
  678. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  679. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  680. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  681. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  682. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  683. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  684. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +234 -335
  685. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  686. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  687. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +871 -42
  688. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  689. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  690. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  691. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  692. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +36 -138
  693. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +151 -0
  694. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  695. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  696. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  697. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  698. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  699. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  700. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  701. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  702. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  703. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +15 -40
  704. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  705. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +39 -12
  706. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  707. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  708. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  709. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  710. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  711. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  712. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +213 -0
  713. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  714. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +24 -15
  715. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  716. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +253 -16
  717. data/ext/sources/ggml/src/ggml.c +268 -52
  718. data/ext/sources/ggml/src/gguf.cpp +377 -47
  719. data/ext/sources/include/parakeet.h +342 -0
  720. data/ext/sources/include/whisper.h +10 -0
  721. data/ext/sources/media/matmul.png +0 -0
  722. data/ext/sources/src/CMakeLists.txt +23 -0
  723. data/ext/sources/src/parakeet-arch.h +188 -0
  724. data/ext/sources/src/parakeet.cpp +3838 -0
  725. data/ext/sources/src/whisper.cpp +62 -40
  726. data/extsources.rb +26 -10
  727. data/lib/whisper/log_settable.rb +36 -0
  728. data/lib/whisper/model/uri.rb +13 -1
  729. data/lib/whisper/output.rb +74 -0
  730. data/sig/whisper.rbs +445 -55
  731. data/test/helper.rb +2 -0
  732. data/test/jfk_reader/jfk_reader.c +50 -7
  733. data/test/test_callback.rb +1 -0
  734. data/test/test_context_params.rb +82 -0
  735. data/test/test_package.rb +6 -5
  736. data/test/test_parakeet.rb +28 -0
  737. data/test/test_parakeet_callback.rb +107 -0
  738. data/test/test_parakeet_context.rb +116 -0
  739. data/test/test_parakeet_context_params.rb +24 -0
  740. data/test/test_parakeet_model.rb +21 -0
  741. data/test/test_parakeet_params.rb +78 -0
  742. data/test/test_parakeet_segment.rb +42 -0
  743. data/test/test_parakeet_token.rb +73 -0
  744. data/test/test_params.rb +2 -0
  745. data/test/test_token.rb +11 -0
  746. data/test/test_vad_context.rb +58 -8
  747. data/test/test_vad_segment.rb +1 -1
  748. data/test/test_whisper.rb +44 -6
  749. data/whispercpp.gemspec +2 -2
  750. metadata +426 -280
  751. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  752. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  753. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  754. data/ext/sources/bindings/javascript/package.json +0 -26
  755. data/ext/sources/bindings/javascript/whisper.js +0 -19
  756. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  757. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  758. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  759. data/ext/sources/examples/addon.node/index.js +0 -59
  760. data/ext/sources/examples/addon.node/package.json +0 -16
  761. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  762. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  763. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  764. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  765. data/ext/sources/examples/coi-serviceworker.js +0 -146
  766. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  767. data/ext/sources/examples/command/command.cpp +0 -802
  768. data/ext/sources/examples/command/commands.txt +0 -9
  769. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  770. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  771. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  772. data/ext/sources/examples/generate-karaoke.sh +0 -57
  773. data/ext/sources/examples/helpers.js +0 -191
  774. data/ext/sources/examples/livestream.sh +0 -112
  775. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  776. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  777. data/ext/sources/examples/lsp/whisper.vim +0 -362
  778. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  779. data/ext/sources/examples/python/whisper_processor.py +0 -54
  780. data/ext/sources/examples/server/bench.js +0 -29
  781. data/ext/sources/examples/server.py +0 -120
  782. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  783. data/ext/sources/examples/stream/stream.cpp +0 -437
  784. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  785. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  786. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  787. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  788. data/ext/sources/examples/sycl/build.sh +0 -22
  789. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  790. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  791. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -47
  792. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  793. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -494
  794. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -88
  795. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2559
  796. data/ext/sources/examples/talk-llama/llama-arch.h +0 -586
  797. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -917
  798. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  799. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -876
  800. data/ext/sources/examples/talk-llama/llama-chat.h +0 -70
  801. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3645
  802. data/ext/sources/examples/talk-llama/llama-context.h +0 -360
  803. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  804. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -42
  805. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  806. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  807. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2282
  808. data/ext/sources/examples/talk-llama/llama-graph.h +0 -910
  809. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -241
  810. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -284
  811. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  812. data/ext/sources/examples/talk-llama/llama-impl.h +0 -63
  813. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  814. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  815. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -328
  816. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  817. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2100
  818. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -390
  819. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  820. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  821. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  822. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1167
  823. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  824. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  825. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  826. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -735
  827. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  828. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1247
  829. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -176
  830. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -285
  831. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -37
  832. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -8338
  833. data/ext/sources/examples/talk-llama/llama-model.h +0 -544
  834. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1072
  835. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  836. data/ext/sources/examples/talk-llama/llama-sampling.cpp +0 -3771
  837. data/ext/sources/examples/talk-llama/llama-sampling.h +0 -44
  838. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3900
  839. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -182
  840. data/ext/sources/examples/talk-llama/llama.cpp +0 -1140
  841. data/ext/sources/examples/talk-llama/llama.h +0 -1540
  842. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -191
  843. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  844. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  845. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -138
  846. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  847. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -122
  848. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -144
  849. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -135
  850. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -178
  851. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -160
  852. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  853. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  854. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  855. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  856. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  857. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  858. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  859. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -123
  860. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  861. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -144
  862. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -259
  863. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -134
  864. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  865. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -150
  866. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  867. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  868. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  869. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -113
  870. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  871. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  872. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  873. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  874. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  875. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  876. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  877. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -150
  878. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  879. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  880. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -196
  881. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -211
  882. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +0 -283
  883. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  884. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -141
  885. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  886. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -154
  887. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  888. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  889. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  890. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -175
  891. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  892. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  893. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  894. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -168
  895. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  896. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -55
  897. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -123
  898. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -199
  899. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -124
  900. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  901. data/ext/sources/examples/talk-llama/models/models.h +0 -569
  902. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -116
  903. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  904. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -150
  905. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  906. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  907. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  908. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  909. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  910. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  911. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  912. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  913. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  914. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  915. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  916. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  917. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -316
  918. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  919. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -168
  920. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  921. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  922. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  923. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  924. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -117
  925. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -124
  926. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -873
  927. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -149
  928. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -141
  929. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  930. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  931. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -162
  932. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  933. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  934. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -135
  935. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  936. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  937. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  938. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  939. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  940. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  941. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  942. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  943. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  944. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  945. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  946. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  947. data/ext/sources/examples/talk-llama/speak +0 -40
  948. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  949. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  950. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  951. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  952. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  953. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1147
  954. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  955. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  956. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  957. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  958. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  959. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  960. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  961. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  962. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  963. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  964. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  965. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  966. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  967. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  968. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  969. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  970. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +0 -157
  971. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -165
  972. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  973. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  974. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  975. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  976. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  977. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  978. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  979. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  980. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  981. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  982. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  983. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  984. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -147
  985. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  986. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +0 -907
  987. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +0 -247
  988. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  989. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  990. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  991. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
  992. data/ext/sources/tests/CMakeLists.txt +0 -112
  993. data/ext/sources/tests/earnings21/eval.mk +0 -58
  994. data/ext/sources/tests/earnings21/eval.py +0 -68
  995. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  996. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  997. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  998. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  999. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  1000. data/ext/sources/tests/en-0-ref.txt +0 -1
  1001. data/ext/sources/tests/en-1-ref.txt +0 -1
  1002. data/ext/sources/tests/en-2-ref.txt +0 -1
  1003. data/ext/sources/tests/es-0-ref.txt +0 -1
  1004. data/ext/sources/tests/librispeech/eval.mk +0 -39
  1005. data/ext/sources/tests/librispeech/eval.py +0 -47
  1006. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  1007. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  1008. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  1009. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  1010. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  1011. data/ext/sources/tests/run-tests.sh +0 -130
  1012. data/ext/sources/tests/test-c.c +0 -3
  1013. data/ext/sources/tests/test-vad-full.cpp +0 -56
  1014. data/ext/sources/tests/test-vad.cpp +0 -83
  1015. data/ext/sources/tests/test-whisper.js +0 -58
  1016. data/lib/whisper/context.rb +0 -15
  1017. data/lib/whisper/segment.rb +0 -58
@@ -28,6 +28,7 @@
28
28
  #define QK8_0 32
29
29
  #define QR8_0 1
30
30
  #define QK_K 256
31
+ #define K_SCALE_SIZE (3 * QK_K / 64)
31
32
  #define K_QUANTS_PER_ITERATION 2
32
33
 
33
34
  typedef char int8_t;
@@ -46,6 +47,118 @@ struct block_q4_0
46
47
  uint8_t qs[QK4_0 / 2];
47
48
  };
48
49
 
50
+ //------------------------------------------------------------------------------
51
+ // block_q4_1
52
+ //------------------------------------------------------------------------------
53
+ struct block_q4_1 {
54
+ half d; // delta
55
+ half m; // min
56
+ uchar qs[QK4_1 / 2]; // nibbles / quants
57
+ };
58
+
59
+ //------------------------------------------------------------------------------
60
+ // block_q5_0
61
+ //------------------------------------------------------------------------------
62
+ struct block_q5_0 {
63
+ half d; // delta
64
+ uchar qh[4]; // 5-th bit of quants
65
+ uchar qs[QK5_0 / 2]; // nibbles / quants
66
+ };
67
+
68
+ //------------------------------------------------------------------------------
69
+ // block_q5_1
70
+ //------------------------------------------------------------------------------
71
+ struct block_q5_1 {
72
+ half d; // delta
73
+ half m; // min
74
+ uchar qh[4]; // 5-th bit of quants
75
+ uchar qs[QK5_1 / 2]; // nibbles / quants
76
+ };
77
+
78
+ //------------------------------------------------------------------------------
79
+ // block_q4_k
80
+ //------------------------------------------------------------------------------
81
+ struct block_q4_K {
82
+ half d; // delta
83
+ half dm; // min
84
+ uchar s[K_SCALE_SIZE];
85
+ uchar q[QK_K / 2]; // nibbles / quants
86
+ };
87
+
88
+ //------------------------------------------------------------------------------
89
+ // block_q5_k
90
+ //------------------------------------------------------------------------------
91
+ struct block_q5_K {
92
+ half d; // delta
93
+ half dm; // min
94
+ uchar s[K_SCALE_SIZE];
95
+ uchar qh[QK_K / 8];
96
+ uchar qs[QK_K / 2]; // nibbles / quants
97
+ };
98
+
99
+ //------------------------------------------------------------------------------
100
+ // block_q6_K
101
+ //------------------------------------------------------------------------------
102
+ struct block_q6_K {
103
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits
104
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits
105
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits
106
+ half d; // super-block scale
107
+ };
108
+
109
+ //------------------------------------------------------------------------------
110
+ // block_iq4_nl
111
+ //------------------------------------------------------------------------------
112
+ #define QK4_NL 32
113
+
114
+ struct block_iq4_nl
115
+ {
116
+ half d;
117
+ uint8_t qs[QK4_NL / 2];
118
+ };
119
+
120
+ //------------------------------------------------------------------------------
121
+ // bf16 to f16
122
+ //------------------------------------------------------------------------------
123
+ kernel void kernel_convert_bf16_to_f16(
124
+ global const ushort * src,
125
+ global half * dst,
126
+ ulong off_dst,
127
+ ulong n
128
+ ) {
129
+ uint i = get_global_id(0);
130
+ if (i >= n) {
131
+ return;
132
+ }
133
+
134
+ dst[i + off_dst] = (half) as_float((uint) src[i] << 16);
135
+ }
136
+
137
+ //------------------------------------------------------------------------------
138
+ // f16 to bf16
139
+ //------------------------------------------------------------------------------
140
+ kernel void kernel_convert_f16_to_bf16(
141
+ global const half * src,
142
+ ulong off_src,
143
+ global ushort * dst,
144
+ ulong n
145
+ ) {
146
+ uint i = get_global_id(0);
147
+ if (i >= n) {
148
+ return;
149
+ }
150
+
151
+ float f = (float) src[i + off_src];
152
+ uint bits = as_uint(f);
153
+ if ((bits & 0x7fffffffu) > 0x7f800000u) {
154
+ // nan to quiet nan
155
+ dst[i] = (ushort)((bits >> 16) | 0x40u);
156
+ } else {
157
+ uint rounded = bits + 0x7fffu + ((bits >> 16) & 1u);
158
+ dst[i] = (ushort)(rounded >> 16);
159
+ }
160
+ }
161
+
49
162
  //------------------------------------------------------------------------------
50
163
  // kernel_convert_block_q4_0
51
164
  // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
@@ -138,76 +251,248 @@ kernel void kernel_restore_block_q4_0_noshuffle(
138
251
  }
139
252
  }
140
253
 
141
- //------------------------------------------------------------------------------
142
- // block_mxfp4
143
- //------------------------------------------------------------------------------
144
- #define QK_MXFP4 32
145
- struct block_mxfp4 {
146
- uchar e; // E8M0
147
- uchar qs[QK_MXFP4 / 2];
148
- };
254
+ kernel void kernel_convert_block_q4_0_trans4_ns(
255
+ global struct block_q4_0 * src0,
256
+ __global uint * dst_q,
257
+ __global half * dst_d,
258
+ uint ne00,
259
+ uint ne01
260
+ ) {
261
+ uint i00 = get_global_id(1);
262
+ uint i01 = get_global_id(0);
263
+ uint i02 = get_global_id(2);
264
+
265
+ if (i01 >= ne01) {
266
+ return;
267
+ }
268
+
269
+ uint ne00_blk = ne00 / QK4_0;
270
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
271
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
272
+
273
+ global struct block_q4_0 * b = src0 + src_blk_offset;
274
+ dst_d[dst_blk_offset] = b->d;
275
+
276
+ // extract quantization and unshuffle
277
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
278
+
279
+ ushort8 post_block = (ushort8)(0);
280
+
281
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
282
+ uchar * post_block_ptr = (uchar *)(&post_block);
283
+
284
+ for (int i = 0; i < QK4_0 / 4; ++i) {
285
+ uchar x0 = pre_block_ptr[2*i + 0];
286
+ uchar x1 = pre_block_ptr[2*i + 1];
287
+
288
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
289
+ post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
290
+ }
291
+
292
+ uint4 q_block = as_uint4(post_block);
293
+
294
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
295
+ dst_q[offset] = q_block.x;
296
+ dst_q[offset + ne01] = q_block.y;
297
+ dst_q[offset + ne01 * 2] = q_block.z;
298
+ dst_q[offset + ne01 * 3] = q_block.w;
299
+ }
300
+
301
+ kernel void kernel_restore_block_q4_0_trans4_ns(
302
+ __global uint * src_q,
303
+ __global half * src_d,
304
+ __global struct block_q4_0 * dst0,
305
+ uint ne00,
306
+ uint ne01
307
+ ) {
308
+ uint i00 = get_global_id(1);
309
+ uint i01 = get_global_id(0);
310
+ uint i02 = get_global_id(2);
311
+
312
+ if (i01 >= ne01) {
313
+ return;
314
+ }
315
+
316
+ uint ne00_blk = ne00 / QK4_0;
317
+ uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
318
+ uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
319
+
320
+ __global struct block_q4_0 * b = dst0 + dst_blk_offset;
321
+ b->d = src_d[src_d_offset];
322
+
323
+ // collect transposed quantization parts for a block
324
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
325
+ uint4 q_block;
326
+ q_block.x = src_q[src_q_offset];
327
+ q_block.y = src_q[src_q_offset + ne01];
328
+ q_block.z = src_q[src_q_offset + ne01 * 2];
329
+ q_block.w = src_q[src_q_offset + ne01 * 3];
330
+
331
+ ushort8 post_block = as_ushort8(q_block);
332
+ ushort8 pre_block = (ushort8)(0);
333
+
334
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
335
+ uchar * post_block_ptr = (uchar *)(&post_block);
336
+
337
+ for (int i = 0; i < QK4_0 / 4; ++i) {
338
+ uchar x0 = post_block_ptr[i + 0];
339
+ uchar x1 = post_block_ptr[i + QK4_0 / 4];
340
+
341
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
342
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
343
+ }
344
+
345
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
346
+ }
149
347
 
150
348
  //------------------------------------------------------------------------------
151
- // kernel_convert_block_mxfp4
152
- // Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
349
+ // kernel_convert_block_q4_1
350
+ // Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
153
351
  // This kernel does not deshuffle the bits.
154
352
  //------------------------------------------------------------------------------
155
- kernel void kernel_convert_block_mxfp4(
156
- global struct block_mxfp4 * src0,
353
+ kernel void kernel_convert_block_q4_1(
354
+ global struct block_q4_1 * src0,
157
355
  global uchar * dst_q,
158
- global uchar * dst_e
356
+ global half * dst_d,
357
+ global half * dst_m
159
358
  ) {
160
- global struct block_mxfp4 * b = (global struct block_mxfp4 *) src0 + get_global_id(0);
161
- global uchar * q = (global uchar *) dst_q + QK_MXFP4 / 2 * get_global_id(0);
162
- global uchar * e = (global uchar *) dst_e + get_global_id(0);
359
+ global struct block_q4_1 * b = (global struct block_q4_1 *) src0 + get_global_id(0);
360
+ global uchar * q = (global uchar *) dst_q + QK4_1/2*get_global_id(0);
361
+ global half * d = (global half *) dst_d + get_global_id(0);
362
+ global half * m = (global half *) dst_m + get_global_id(0);
163
363
 
164
- *e = b->e;
364
+ *d = b->d;
365
+ *m = b->m;
165
366
 
166
- for (int i = 0; i < QK_MXFP4 / 2; ++i) {
367
+ for (int i = 0; i < QK4_1/2; ++i) {
167
368
  q[i] = b->qs[i];
168
369
  }
169
370
  }
170
371
 
171
- kernel void kernel_convert_block_mxfp4_trans(
172
- global struct block_mxfp4 * src0,
173
- __global uint4 * dst_q,
174
- __global uchar * dst_e,
372
+ kernel void kernel_restore_block_q4_1(
373
+ global uchar * src_q,
374
+ global half * src_d,
375
+ global half * src_m,
376
+ global struct block_q4_1 * dst
377
+ ) {
378
+ global struct block_q4_1 * b = (global struct block_q4_1 *) dst + get_global_id(0);
379
+ global uchar * q = (global uchar *) src_q + QK4_1/2*get_global_id(0);
380
+ global half * d = (global half *) src_d + get_global_id(0);
381
+ global half * m = (global half *) src_m + get_global_id(0);
382
+
383
+ b->d = *d;
384
+ b->m = *m;
385
+ for (int i = 0; i < QK4_1/2; ++i) {
386
+ b->qs[i] = q[i];
387
+ }
388
+ }
389
+
390
+ kernel void kernel_convert_block_q4_1_noshuffle(
391
+ global struct block_q4_1 * src0,
392
+ global uchar * dst_q,
393
+ global half * dst_d,
394
+ global half * dst_m
395
+ ) {
396
+ global struct block_q4_1 * b = (global struct block_q4_1 *) src0 + get_global_id(0);
397
+ global uchar * q = (global uchar *) dst_q + QK4_1/2*get_global_id(0);
398
+ global half * d = (global half *) dst_d + get_global_id(0);
399
+ global half * m = (global half *) dst_m + get_global_id(0);
400
+
401
+ *d = b->d;
402
+ *m = b->m;
403
+ for (int i = 0; i < QK4_1/4; ++i) {
404
+ uchar x0 = b->qs[2*i + 0];
405
+ uchar x1 = b->qs[2*i + 1];
406
+
407
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
408
+ q[i + QK4_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
409
+
410
+ #ifdef ADRENO_GPU
411
+ if (get_global_id(0) == 65536*4096) {
412
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
413
+ }
414
+ #endif
415
+ }
416
+ }
417
+
418
+ kernel void kernel_restore_block_q4_1_noshuffle(
419
+ global uchar * src_q,
420
+ global half * src_d,
421
+ global half * src_m,
422
+ global struct block_q4_1 * dst,
423
+ uchar mask_0F,
424
+ uchar mask_F0
425
+ ) {
426
+ global struct block_q4_1 * b = (global struct block_q4_1 *) dst + get_global_id(0);
427
+ global uchar * q = (global uchar *) src_q + QK4_1/2*get_global_id(0);
428
+ global half * d = (global half *) src_d + get_global_id(0);
429
+ global half * m = (global half *) src_m + get_global_id(0);
430
+
431
+ b->d = *d;
432
+ b->m = *m;
433
+ for (int i = 0; i < QK4_1/4; ++i) {
434
+ uchar x0 = q[i + 0 ] ;
435
+ uchar x1 = q[i + QK4_1/4];
436
+
437
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
438
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
439
+ }
440
+ }
441
+
442
+ kernel void kernel_convert_block_q4_1_trans4_ns(
443
+ __global struct block_q4_1 * src0,
444
+ __global uint * dst_q,
445
+ __global half * dst_d,
446
+ __global half * dst_m,
175
447
  uint ne00,
176
448
  uint ne01
177
449
  ) {
178
- int i00 = get_global_id(1);
450
+ uint i00 = get_global_id(1);
179
451
  uint i01 = get_global_id(0);
180
452
  uint i02 = get_global_id(2);
181
453
 
182
- uint ne00_blk = ne00 / QK_MXFP4;
454
+ if (i01 >= ne01) {
455
+ return;
456
+ }
457
+
458
+ uint ne00_blk = ne00 / QK4_1;
183
459
  uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
184
460
  uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
185
461
 
186
- global struct block_mxfp4 * b = src0 + src_blk_offset;
462
+ global struct block_q4_1 * b = src0 + src_blk_offset;
463
+ dst_d[dst_blk_offset] = b->d;
464
+ dst_m[dst_blk_offset] = b->m;
187
465
 
188
- dst_q[dst_blk_offset] = ((global uint4 *)(&(b->qs[0])))[0];
189
- dst_e[dst_blk_offset] = b->e;
190
- }
466
+ // extract quantization and unshuffle
467
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
191
468
 
192
- kernel void kernel_restore_block_mxfp4(
193
- global uchar * src_q,
194
- global half * src_e,
195
- global struct block_mxfp4 * dst
196
- ) {
197
- global struct block_mxfp4 * b = (global struct block_mxfp4 *) dst + get_global_id(0);
198
- global uchar * q = (global uchar *) src_q + QK_MXFP4 / 2 * get_global_id(0);
199
- global uchar * e = (global uchar *) src_e + get_global_id(0);
469
+ ushort8 post_block = (ushort8)(0);
200
470
 
201
- b->e = *e;
202
- for (int i = 0; i < QK_MXFP4 / 2; ++i) {
203
- b->qs[i] = q[i];
471
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
472
+ uchar * post_block_ptr = (uchar *)(&post_block);
473
+
474
+ for (int i = 0; i < QK4_1 / 4; ++i) {
475
+ uchar x0 = pre_block_ptr[2*i + 0];
476
+ uchar x1 = pre_block_ptr[2*i + 1];
477
+
478
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
479
+ post_block_ptr[i + QK4_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
204
480
  }
481
+
482
+ uint4 q_block = as_uint4(post_block);
483
+
484
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
485
+ dst_q[offset] = q_block.x;
486
+ dst_q[offset + ne01] = q_block.y;
487
+ dst_q[offset + ne01 * 2] = q_block.z;
488
+ dst_q[offset + ne01 * 3] = q_block.w;
205
489
  }
206
490
 
207
- kernel void kernel_restore_block_mxfp4_trans(
208
- __global uint4 * src_q,
209
- __global uchar * src_e,
210
- global struct block_mxfp4 * dst,
491
+ kernel void kernel_restore_block_q4_1_trans4_ns(
492
+ __global uint * src_q,
493
+ __global half * src_d,
494
+ __global half * src_m,
495
+ __global struct block_q4_1 * dst0,
211
496
  uint ne00,
212
497
  uint ne01
213
498
  ) {
@@ -215,51 +500,1677 @@ kernel void kernel_restore_block_mxfp4_trans(
215
500
  uint i01 = get_global_id(0);
216
501
  uint i02 = get_global_id(2);
217
502
 
218
- uint ne00_blk = ne00 / QK_MXFP4;
219
- uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
503
+ if (i01 >= ne01) {
504
+ return;
505
+ }
506
+
507
+ uint ne00_blk = ne00 / QK4_1;
220
508
  uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
509
+ uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
221
510
 
222
- global struct block_mxfp4 * b = dst + dst_blk_offset;
511
+ __global struct block_q4_1 * b = dst0 + dst_blk_offset;
512
+ b->d = src_d[src_dm_offset];
513
+ b->m = src_m[src_dm_offset];
223
514
 
224
- ((global uint4 *)(&(b->qs[0])))[0] = src_q[src_blk_offset];
225
- b->e = src_e[src_blk_offset];
515
+ // collect transposed quantization parts for a block
516
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
517
+ uint4 q_block;
518
+ q_block.x = src_q[src_q_offset];
519
+ q_block.y = src_q[src_q_offset + ne01];
520
+ q_block.z = src_q[src_q_offset + ne01 * 2];
521
+ q_block.w = src_q[src_q_offset + ne01 * 3];
522
+
523
+ ushort8 post_block = as_ushort8(q_block);
524
+ ushort8 pre_block = (ushort8)(0);
525
+
526
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
527
+ uchar * post_block_ptr = (uchar *)(&post_block);
528
+
529
+ for (int i = 0; i < QK4_0 / 4; ++i) {
530
+ uchar x0 = post_block_ptr[i + 0];
531
+ uchar x1 = post_block_ptr[i + QK4_0 / 4];
532
+
533
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
534
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
535
+ }
536
+
537
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
226
538
  }
227
539
 
228
540
  //------------------------------------------------------------------------------
229
- // block_q8_0
541
+ // kernel_convert_block_q5_0
542
+ // Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA).
543
+ // This kernel does not deshuffle the bits.
230
544
  //------------------------------------------------------------------------------
231
- typedef struct {
232
- half d; // delta
233
- char qs[QK8_0]; // quants
234
- } block_q8_0;
545
+ kernel void kernel_convert_block_q5_0(
546
+ global struct block_q5_0 * src0,
547
+ global uchar * dst_qs,
548
+ global uint * dst_qh,
549
+ global half * dst_d,
550
+ ulong n_blk
551
+ ) {
552
+ if (get_global_id(0) >= n_blk) {
553
+ return;
554
+ }
235
555
 
236
- kernel void kernel_convert_block_q8_0(
237
- global block_q8_0 * src0,
556
+ global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
557
+ global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0);
558
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
559
+ global half * d = (global half *) dst_d + get_global_id(0);
560
+
561
+ *d = b->d;
562
+ *qh = *((global uint *)(b->qh));
563
+
564
+ for (int i = 0; i < QK5_0/2; ++i) {
565
+ qs[i] = b->qs[i];
566
+ }
567
+ }
568
+
569
+ kernel void kernel_restore_block_q5_0(
570
+ global uchar * src_qs,
571
+ global uint * src_qh,
572
+ global half * src_d,
573
+ global struct block_q5_0 * dst
574
+ ) {
575
+ global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
576
+ global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0);
577
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
578
+ global half * d = (global half *) src_d + get_global_id(0);
579
+
580
+ b->d = *d;
581
+ *((global uint *)(b->qh)) = *qh;
582
+ for (int i = 0; i < QK5_0/2; ++i) {
583
+ b->qs[i] = qs[i];
584
+ }
585
+ }
586
+
587
+ kernel void kernel_convert_block_q5_0_noshuffle(
588
+ global struct block_q5_0 * src0,
238
589
  global uchar * dst_q,
590
+ global uint * dst_qh,
239
591
  global half * dst_d
240
592
  ) {
241
- global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
242
- global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
243
- global half * d = (global half *) dst_d + get_global_id(0);
593
+ global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
594
+ global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
595
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
596
+ global half * d = (global half *) dst_d + get_global_id(0);
244
597
 
245
598
  *d = b->d;
599
+ *qh = *((global uint *)(b->qh));
246
600
 
247
- for (int i = 0; i < QK8_0; ++i) {
248
- q[i] = b->qs[i];
601
+ for (int i = 0; i < QK5_0/4; ++i) {
602
+ uchar x0 = b->qs[2*i + 0];
603
+ uchar x1 = b->qs[2*i + 1];
604
+
605
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
606
+ q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
607
+
608
+ #ifdef ADRENO_GPU
609
+ if (get_global_id(0) == 65536*4096) {
610
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
611
+ }
612
+ #endif
249
613
  }
250
614
  }
251
615
 
252
- kernel void kernel_restore_block_q8_0(
616
+ kernel void kernel_restore_block_q5_0_noshuffle(
253
617
  global uchar * src_q,
618
+ global uint * src_qh,
254
619
  global half * src_d,
255
- global block_q8_0 * dst
620
+ global struct block_q5_0 * dst,
621
+ uchar mask_0F,
622
+ uchar mask_F0
256
623
  ) {
257
- global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
258
- global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
259
- global half * d = (global half *) src_d + get_global_id(0);
624
+ global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
625
+ global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0);
626
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
627
+ global half * d = (global half *) src_d + get_global_id(0);
260
628
 
261
629
  b->d = *d;
262
- for (int i = 0; i < QK8_0; ++i) {
263
- b->qs[i] = q[i];
630
+ *((global uint *)(b->qh)) = *qh;
631
+
632
+ for (int i = 0; i < QK5_0/4; ++i) {
633
+ uchar x0 = q[i + 0 ];
634
+ uchar x1 = q[i + QK5_0/4];
635
+
636
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
637
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
638
+ }
639
+ }
640
+
641
// Convert block_q5_0 (AOS) into three transposed arrays with the quant nibbles
// regrouped. One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_d / dst_qh receive one element per block at row + blk*ne01 + batch offset,
// so the row index becomes the fastest-varying one. dst_qs receives the block's
// 16 quant bytes as four uints placed ne01 elements apart.
kernel void kernel_convert_block_q5_0_trans4_ns(
    __global struct block_q5_0 * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    uint ne00,
    uint ne01
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK5_0;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = blk + row * blks_per_row + batch_off;
    const uint dst_idx      = row + blk * ne01 + batch_off;

    global struct block_q5_0 * src_blk = src0 + src_idx;

    dst_d[dst_idx]  = src_blk->d;
    dst_qh[dst_idx] = ((global uint *)(&(src_blk->qh[0])))[0];

    // Copy the quant bytes to private memory and regroup the nibbles: result
    // byte k holds the low nibbles of source bytes 2k and 2k+1, result byte
    // k + QK5_0/4 holds their high nibbles.
    // NOTE(review): the vector load assumes qs is suitably aligned; block_q5_0
    // is declared outside this view - confirm.
    ushort8 shuffled   = ((global ushort8 *)(&(src_blk->qs[0])))[0];
    ushort8 deshuffled = (ushort8)(0);

    uchar * in_bytes  = (uchar *)(&shuffled);
    uchar * out_bytes = (uchar *)(&deshuffled);

    for (int k = 0; k < QK5_0 / 4; ++k) {
        uchar even = in_bytes[2*k + 0];
        uchar odd  = in_bytes[2*k + 1];

        out_bytes[k + 0        ] = convert_uchar(even & 0x0F) | convert_uchar((odd & 0x0F) << 4);
        out_bytes[k + QK5_0 / 4] = convert_uchar((even & 0xF0) >> 4) | convert_uchar(odd & 0xF0);
    }

    uint4 words = as_uint4(deshuffled);

    // Four words per block, each written to its own ne01-strided plane.
    const uint qs_idx = batch_off * 4 + blk * ne01 * 4 + row;
    dst_qs[qs_idx]            = words.x;
    dst_qs[qs_idx + ne01]     = words.y;
    dst_qs[qs_idx + ne01 * 2] = words.z;
    dst_qs[qs_idx + ne01 * 3] = words.w;
}
689
+
690
// Rebuild block_q5_0 (AOS) from the transposed, nibble-regrouped arrays.
// One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// src_d / src_qh are read at row + blk*ne01 + batch offset; src_qs supplies four
// uints per block that lie ne01 elements apart.
kernel void kernel_restore_block_q5_0_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global struct block_q5_0 * dst0,
    uint ne00,
    uint ne01
) {
    // All three indices are unsigned: they are only ever mixed with the uint
    // extents below, so a signed block index adds nothing but implicit
    // conversions.
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK5_0;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint dst_idx      = blk + row * blks_per_row + batch_off;
    const uint src_idx      = row + blk * ne01 + batch_off;

    __global struct block_q5_0 * dst_blk = dst0 + dst_idx;

    dst_blk->d = src_d[src_idx];
    ((__global uint *)(&(dst_blk->qh[0])))[0] = src_qh[src_idx];

    // Collect the four strided words that make up this block's quants.
    const uint qs_idx = batch_off * 4 + blk * ne01 * 4 + row;
    uint4 words;
    words.x = src_qs[qs_idx];
    words.y = src_qs[qs_idx + ne01];
    words.z = src_qs[qs_idx + ne01 * 2];
    words.w = src_qs[qs_idx + ne01 * 3];

    ushort8 deshuffled = as_ushort8(words);
    ushort8 shuffled   = (ushort8)(0);

    uchar * out_bytes = (uchar *)(&shuffled);
    uchar * in_bytes  = (uchar *)(&deshuffled);

    // Inverse of the convert step: bytes k and k + QK5_0/4 of the regrouped
    // data are recombined into original bytes 2k and 2k+1.
    for (int k = 0; k < QK5_0 / 4; ++k) {
        uchar lows  = in_bytes[k + 0];
        uchar highs = in_bytes[k + QK5_0 / 4];

        out_bytes[2 * k + 0] = convert_uchar(lows & 0x0F) | convert_uchar((highs & 0x0F) << 4);
        out_bytes[2 * k + 1] = convert_uchar((lows & 0xF0) >> 4) | convert_uchar(highs & 0xF0);
    }

    // NOTE(review): the vector store assumes qs is suitably aligned;
    // block_q5_0 is declared outside this view - confirm.
    ((__global ushort8 *)(&(dst_blk->qs[0])))[0] = shuffled;
}
739
+
740
//------------------------------------------------------------------------------
// kernel_convert_block_q5_1
// Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA):
// quant bytes, the 32-bit qh word, d and m.
// This kernel does not deshuffle the bits.
// One work-item per block; work-items with an id >= n_blk do nothing.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q5_1(
    global struct block_q5_1 * src0,
    global uchar * dst_qs,
    global uint * dst_qh,
    global half * dst_d,
    global half * dst_m,
    ulong n_blk
) {
    const size_t gid = get_global_id(0);

    if (gid >= n_blk) {
        return;
    }

    global struct block_q5_1 * src_blk = src0 + gid;
    global uchar * out_qs = dst_qs + (QK5_1/2)*gid;

    dst_d[gid]  = src_blk->d;
    dst_m[gid]  = src_blk->m;
    // The qh bytes are moved as a single 32-bit word.
    dst_qh[gid] = *((global uint *)(src_blk->qh));

    // Quant bytes are copied verbatim.
    for (int k = 0; k < QK5_1/2; ++k) {
        out_qs[k] = src_blk->qs[k];
    }
}
771
+
772
// Rebuild block_q5_1 (AOS) from the four flat arrays written by the plain
// (non-deshuffling) convert kernel. One work-item per block; there is no bounds
// check here, so the NDRange must not exceed the number of blocks.
kernel void kernel_restore_block_q5_1(
    global uchar * src_qs,
    global uint * src_qh,
    global half * src_d,
    global half * src_m,
    global struct block_q5_1 * dst
) {
    const size_t gid = get_global_id(0);

    global struct block_q5_1 * dst_blk = dst + gid;
    global uchar * in_qs = src_qs + (QK5_1/2)*gid;

    dst_blk->d = src_d[gid];
    dst_blk->m = src_m[gid];
    // The qh bytes are written back as a single 32-bit word.
    *((global uint *)(dst_blk->qh)) = src_qh[gid];

    // Quant bytes are copied verbatim.
    for (int k = 0; k < QK5_1/2; ++k) {
        dst_blk->qs[k] = in_qs[k];
    }
}
792
+
793
// Convert block_q5_1 (AOS) to four flat arrays while regrouping the quant
// nibbles: output byte i holds the low nibbles of source bytes 2i and 2i+1,
// output byte i + QK5_1/4 holds their high nibbles.
// One work-item per block; there is no bounds check, so the NDRange must not
// exceed the number of blocks.
kernel void kernel_convert_block_q5_1_noshuffle(
    global struct block_q5_1 * src0,
    global uchar * dst_q,
    global uint * dst_qh,
    global half * dst_d,
    global half * dst_m
) {
    global struct block_q5_1 * src_blk = (global struct block_q5_1 *) src0 + get_global_id(0);
    global uchar * out_q  = (global uchar *) dst_q  + QK5_1/2*get_global_id(0);
    global uint  * out_qh = (global uint *)  dst_qh + get_global_id(0);
    global half  * out_d  = (global half *)  dst_d  + get_global_id(0);
    global half  * out_m  = (global half *)  dst_m  + get_global_id(0);

    *out_d  = src_blk->d;
    *out_m  = src_blk->m;
    *out_qh = *((global uint *)(src_blk->qh));

    for (int i = 0; i < QK5_1/4; ++i) {
        uchar x0 = src_blk->qs[2*i + 0];
        uchar x1 = src_blk->qs[2*i + 1];

        out_q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        out_q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

#ifdef ADRENO_GPU
        // NOTE(review): the guard only matches work-item 65536*4096, so this
        // printf is not expected to produce output in practice. Presumably it is
        // kept as a compiler workaround rather than for debugging - confirm
        // before removing or restructuring it.
        if (get_global_id(0) == 65536*4096) {
            printf("%04x - %02x\n", *(global ushort*)out_d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
#endif
    }
}
824
+
825
// Rebuild block_q5_1 (AOS) from the nibble-regrouped flat arrays.
// Input byte i carries the low nibbles and input byte i + QK5_1/4 the high
// nibbles of original bytes 2i and 2i+1. The nibble masks are supplied as
// kernel arguments (mask_0F / mask_F0) rather than written as literals.
// One work-item per block; there is no bounds check.
kernel void kernel_restore_block_q5_1_noshuffle(
    global uchar * src_q,
    global uint * src_qh,
    global half * src_d,
    global half * src_m,
    global struct block_q5_1 * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    global struct block_q5_1 * dst_blk = (global struct block_q5_1 *) dst + get_global_id(0);
    global uchar * in_q  = (global uchar *) src_q  + QK5_1/2*get_global_id(0);
    global uint  * in_qh = (global uint *)  src_qh + get_global_id(0);
    global half  * in_d  = (global half *)  src_d  + get_global_id(0);
    global half  * in_m  = (global half *)  src_m  + get_global_id(0);

    dst_blk->d = *in_d;
    dst_blk->m = *in_m;
    *((global uint *)(dst_blk->qh)) = *in_qh;

    for (int i = 0; i < QK5_1/4; ++i) {
        uchar lows  = in_q[i + 0      ];
        uchar highs = in_q[i + QK5_1/4];

        dst_blk->qs[2*i + 0] = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
        dst_blk->qs[2*i + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
    }
}
852
+
853
// Convert block_q5_1 (AOS) into four transposed arrays with the quant nibbles
// regrouped. One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_d / dst_m / dst_qh receive one element per block at
// row + blk*ne01 + batch offset. dst_qs receives the block's 16 quant bytes as
// four uints placed ne01 elements apart.
kernel void kernel_convert_block_q5_1_trans4_ns(
    __global struct block_q5_1 * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    __global half * dst_m,
    uint ne00,
    uint ne01
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK5_1;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = blk + row * blks_per_row + batch_off;
    const uint dst_idx      = row + blk * ne01 + batch_off;

    global struct block_q5_1 * src_blk = src0 + src_idx;

    dst_d[dst_idx]  = src_blk->d;
    dst_m[dst_idx]  = src_blk->m;
    dst_qh[dst_idx] = ((global uint *)(&(src_blk->qh[0])))[0];

    // Copy the quant bytes to private memory and regroup the nibbles: result
    // byte k holds the low nibbles of source bytes 2k and 2k+1, result byte
    // k + QK5_1/4 holds their high nibbles.
    ushort8 shuffled   = ((global ushort8 *)(&(src_blk->qs[0])))[0];
    ushort8 deshuffled = (ushort8)(0);

    uchar * in_bytes  = (uchar *)(&shuffled);
    uchar * out_bytes = (uchar *)(&deshuffled);

    for (int k = 0; k < QK5_1 / 4; ++k) {
        uchar even = in_bytes[2*k + 0];
        uchar odd  = in_bytes[2*k + 1];

        out_bytes[k + 0        ] = convert_uchar(even & 0x0F) | convert_uchar((odd & 0x0F) << 4);
        out_bytes[k + QK5_1 / 4] = convert_uchar((even & 0xF0) >> 4) | convert_uchar(odd & 0xF0);
    }

    uint4 words = as_uint4(deshuffled);

    // Four words per block, each written to its own ne01-strided plane.
    const uint qs_idx = batch_off * 4 + blk * ne01 * 4 + row;
    dst_qs[qs_idx]            = words.x;
    dst_qs[qs_idx + ne01]     = words.y;
    dst_qs[qs_idx + ne01 * 2] = words.z;
    dst_qs[qs_idx + ne01 * 3] = words.w;
}
903
+
904
// Rebuild block_q5_1 (AOS) from the transposed, nibble-regrouped arrays.
// One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// src_d / src_m / src_qh are read at row + blk*ne01 + batch offset; src_qs
// supplies four uints per block that lie ne01 elements apart.
kernel void kernel_restore_block_q5_1_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_m,
    __global struct block_q5_1 * dst0,
    uint ne00,
    uint ne01
) {
    // All three indices are unsigned: they are only ever mixed with the uint
    // extents below, so a signed block index adds nothing but implicit
    // conversions.
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK5_1;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint dst_idx      = blk + row * blks_per_row + batch_off;
    const uint src_idx      = row + blk * ne01 + batch_off;

    __global struct block_q5_1 * dst_blk = dst0 + dst_idx;

    dst_blk->d = src_d[src_idx];
    dst_blk->m = src_m[src_idx];
    ((__global uint *)(&(dst_blk->qh[0])))[0] = src_qh[src_idx];

    // Collect the four strided words that make up this block's quants.
    const uint qs_idx = batch_off * 4 + blk * ne01 * 4 + row;
    uint4 words;
    words.x = src_qs[qs_idx];
    words.y = src_qs[qs_idx + ne01];
    words.z = src_qs[qs_idx + ne01 * 2];
    words.w = src_qs[qs_idx + ne01 * 3];

    ushort8 deshuffled = as_ushort8(words);
    ushort8 shuffled   = (ushort8)(0);

    uchar * out_bytes = (uchar *)(&shuffled);
    uchar * in_bytes  = (uchar *)(&deshuffled);

    // Inverse of the convert step: bytes k and k + QK5_1/4 of the regrouped
    // data are recombined into original bytes 2k and 2k+1.
    for (int k = 0; k < QK5_1 / 4; ++k) {
        uchar lows  = in_bytes[k + 0];
        uchar highs = in_bytes[k + QK5_1 / 4];

        out_bytes[2 * k + 0] = convert_uchar(lows & 0x0F) | convert_uchar((highs & 0x0F) << 4);
        out_bytes[2 * k + 1] = convert_uchar((lows & 0xF0) >> 4) | convert_uchar(highs & 0xF0);
    }

    // NOTE(review): the vector store assumes qs is suitably aligned;
    // block_q5_1 is declared outside this view - confirm.
    ((__global ushort8 *)(&(dst_blk->qs[0])))[0] = shuffled;
}
954
+
955
// Convert block_q4_K (AOS) into transposed arrays with the quant nibbles
// regrouped. One work-item handles one super block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_d / dst_dm: one element per block at row + blk*ne01 + batch offset.
// dst_q: 32 uints per block, consecutive words ne01 elements apart.
// dst_s: K_SCALE_SIZE scale bytes per block, kept in the source block order
//        (not transposed).
// The nibble masks arrive as kernel arguments (mask_0F / mask_F0).
kernel void kernel_convert_block_q4_k_trans4_ns(
    __global struct block_q4_K * src0,
    __global uint * dst_q,
    __global half * dst_d,
    __global half * dst_dm,
    __global uchar * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = blk + row * blks_per_row + batch_off;
    const uint dst_idx      = row + blk * ne01 + batch_off;

    __global struct block_q4_K * src_blk = src0 + src_idx;

    dst_d [dst_idx] = src_blk->d;
    dst_dm[dst_idx] = src_blk->dm;

    // Regroup nibbles in 32-byte groups: within a group, result byte j holds the
    // low nibbles of source bytes 2j and 2j+1, result byte j + 16 their high
    // nibbles.
    // NOTE(review): the 128-byte staging buffer matches QK_K / 64 == 4 groups;
    // QK_K is defined outside this view - confirm.
    uint4 words[8];
    uchar * bytes = (uchar *)words;
    for (int g = 0; g < QK_K / 64; ++g) {
        const int g_off = g*32;
        for (int j = 0; j < 16; ++j) {
            uchar even = src_blk->q[g_off + 2*j];
            uchar odd  = src_blk->q[g_off + 2*j + 1];

            bytes[g_off + j     ] = convert_uchar(even & mask_0F) | convert_uchar((odd & mask_0F) << 4);
            bytes[g_off + j + 16] = convert_uchar((even & mask_F0) >> 4) | convert_uchar(odd & mask_F0);
        }
    }

    // Scatter the 32 words, one per ne01-strided plane.
    const uint q_idx = batch_off * 32 + blk * ne01 * 32 + row;
    #pragma unroll
    for (int p = 0; p < 8; ++p) {
        uint4 w = words[p];
        dst_q[q_idx + (p * 4 + 0) * ne01] = w.x;
        dst_q[q_idx + (p * 4 + 1) * ne01] = w.y;
        dst_q[q_idx + (p * 4 + 2) * ne01] = w.z;
        dst_q[q_idx + (p * 4 + 3) * ne01] = w.w;
    }

    // Scales are copied byte for byte.
    __global uchar * scales_out = dst_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk * K_SCALE_SIZE;
    #pragma unroll
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        scales_out[k] = src_blk->s[k];
    }
}
1011
+
1012
// Rebuild block_q4_K (AOS) from the transposed, nibble-regrouped arrays.
// One work-item handles one super block:
//   get_global_id(0) = row, get_global_id(1) = block index along K,
//   get_global_id(2) = batch.
// src_d / src_dm are read at row + blk*ne01 + batch offset, src_q supplies 32
// uints per block that lie ne01 elements apart, and src_s holds K_SCALE_SIZE
// scale bytes per block in destination block order.
kernel void kernel_restore_block_q4_k_trans4_ns(
    __global uint * src_q,
    __global half * src_d,
    __global half * src_dm,
    __global uchar * src_s,
    __global struct block_q4_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1); // block index along K
    const uint row   = get_global_id(0); // row index
    const uint batch = get_global_id(2); // batch index

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = row + blk * ne01 + batch_off;
    const uint dst_idx      = blk + row * blks_per_row + batch_off;

    __global struct block_q4_K * dst_blk = dst0 + dst_idx;

    dst_blk->d  = src_d[src_idx];
    dst_blk->dm = src_dm[src_idx];

    // Scales are copied byte for byte.
    __global uchar * scales_in = src_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk * K_SCALE_SIZE;
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        dst_blk->s[k] = scales_in[k];
    }

    // Gather the 32 strided words that make up this block's quants.
    const uint q_idx = batch_off * 32 + blk * ne01 * 32 + row;

    uint4 words[8];
    for (int p = 0; p < 8; ++p) {
        words[p].x = src_q[q_idx + (p * 4 + 0) * ne01];
        words[p].y = src_q[q_idx + (p * 4 + 1) * ne01];
        words[p].z = src_q[q_idx + (p * 4 + 2) * ne01];
        words[p].w = src_q[q_idx + (p * 4 + 3) * ne01];
    }

    // Inverse nibble regrouping: within each 32-byte group, bytes j and j + 16
    // are recombined into original bytes 2j and 2j+1.
    uchar * bytes = (uchar *)words;
    for (int g = 0; g < QK_K / 64; ++g) {
        const int g_off = g*32;
        for (int j = 0; j < 16; ++j) {
            uchar lows  = bytes[g_off + j];
            uchar highs = bytes[g_off + j + 16];
            dst_blk->q[g_off + 2*j]     = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
            dst_blk->q[g_off + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
        }
    }
}
1066
+
1067
// Convert block_q5_K (AOS) into transposed arrays with the quant nibbles
// regrouped and the high-bit bytes split into bit planes. One work-item handles
// one super block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_d / dst_dm: one element per block at row + blk*ne01 + batch offset.
// dst_qh: 8 uints per block (one per bit plane), ne01 elements apart.
// dst_qs: 32 uints per block, consecutive words ne01 elements apart.
// dst_s : K_SCALE_SIZE scale bytes per block, kept in the source block order.
kernel void kernel_convert_block_q5_k_trans4_ns(
    __global struct block_q5_K * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    __global half * dst_dm,
    __global uchar * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = blk + row * blks_per_row + batch_off;
    const uint dst_idx      = row + blk * ne01 + batch_off;

    __global struct block_q5_K * src_blk = src0 + src_idx;

    dst_d [dst_idx] = src_blk->d;
    dst_dm[dst_idx] = src_blk->dm;

    // Bit-plane transpose of the 32 qh bytes: plane k collects bit k of every
    // byte, with qh[n] landing in bit n of the 32-bit word.
    for (int k = 0; k < 8; k++) {
        uint plane = 0;
        for (int n = 0; n < 32; n++) {
            plane |= ((uint)((src_blk->qh[n] >> k) & 1)) << n;
        }
        dst_qh[row + (blk * 8 + k) * ne01 + batch * blks_per_row * 8 * ne01] = plane;
    }

    // Regroup nibbles in 32-byte groups: within a group, result byte j holds the
    // low nibbles of source bytes 2j and 2j+1, result byte j + 16 their high
    // nibbles.
    uint4 words[8];
    uchar * bytes = (uchar *)words;
    for (int g = 0; g < QK_K / 64; ++g) {
        const int g_off = g*32;
        for (int j = 0; j < 16; ++j) {
            uchar even = src_blk->qs[g_off + 2*j];
            uchar odd  = src_blk->qs[g_off + 2*j + 1];

            bytes[g_off + j     ] = convert_uchar(even & mask_0F) | convert_uchar((odd & mask_0F) << 4);
            bytes[g_off + j + 16] = convert_uchar((even & mask_F0) >> 4) | convert_uchar(odd & mask_F0);
        }
    }

    // Scatter the 32 words, one per ne01-strided plane.
    const uint q_idx = batch_off * 32 + blk * ne01 * 32 + row;
    #pragma unroll
    for (int p = 0; p < 8; ++p) {
        uint4 w = words[p];
        dst_qs[q_idx + (p * 4 + 0) * ne01] = w.x;
        dst_qs[q_idx + (p * 4 + 1) * ne01] = w.y;
        dst_qs[q_idx + (p * 4 + 2) * ne01] = w.z;
        dst_qs[q_idx + (p * 4 + 3) * ne01] = w.w;
    }

    // Scales are copied byte for byte.
    __global uchar * scales_out = dst_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk * K_SCALE_SIZE;
    #pragma unroll
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        scales_out[k] = src_blk->s[k];
    }
}
1136
+
1137
// Rebuild block_q5_K (AOS) from the transposed arrays: nibble-regrouped quants,
// bit-plane high bits, d / dm and scales. One work-item handles one super block:
//   get_global_id(0) = row, get_global_id(1) = block index along K,
//   get_global_id(2) = batch.
kernel void kernel_restore_block_q5_k_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_dm,
    __global uchar * src_s,
    __global struct block_q5_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1); // block index along K
    const uint row   = get_global_id(0); // row index
    const uint batch = get_global_id(2); // batch index

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;
    const uint src_idx      = row + blk * ne01 + batch_off;
    const uint dst_idx      = blk + row * blks_per_row + batch_off;

    __global struct block_q5_K * dst_blk = dst0 + dst_idx;

    dst_blk->d  = src_d[src_idx];
    dst_blk->dm = src_dm[src_idx];

    // Read the 8 bit planes, then rebuild each of the 32 qh bytes: bit k of
    // qh[n] is bit n of plane k. Each byte is assembled privately and stored
    // once.
    uint planes[8];
    for (int k = 0; k < 8; k++) {
        planes[k] = src_qh[row + (blk * 8 + k) * ne01 + batch * blks_per_row * 8 * ne01];
    }
    for (int n = 0; n < 32; n++) {
        uchar h = 0;
        for (int k = 0; k < 8; k++) {
            h |= (uchar)(((planes[k] >> n) & 1) << k);
        }
        dst_blk->qh[n] = h;
    }

    // Scales are copied byte for byte.
    __global uchar * scales_in = src_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk * K_SCALE_SIZE;
    for (int k = 0; k < K_SCALE_SIZE; ++k) {
        dst_blk->s[k] = scales_in[k];
    }

    // Gather the 32 strided words that make up this block's low quants.
    const uint q_idx = batch_off * 32 + blk * ne01 * 32 + row;

    uint4 words[8];
    for (int p = 0; p < 8; ++p) {
        words[p].x = src_qs[q_idx + (p * 4 + 0) * ne01];
        words[p].y = src_qs[q_idx + (p * 4 + 1) * ne01];
        words[p].z = src_qs[q_idx + (p * 4 + 2) * ne01];
        words[p].w = src_qs[q_idx + (p * 4 + 3) * ne01];
    }

    // Inverse nibble regrouping: within each 32-byte group, bytes j and j + 16
    // are recombined into original bytes 2j and 2j+1.
    uchar * bytes = (uchar *)words;
    for (int g = 0; g < QK_K / 64; ++g) {
        const int g_off = g*32;
        for (int j = 0; j < 16; ++j) {
            uchar lows  = bytes[g_off + j];
            uchar highs = bytes[g_off + j + 16];
            dst_blk->qs[g_off + 2*j]     = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
            dst_blk->qs[g_off + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
        }
    }
}
1207
+
1208
// Convert block_q6_K (AOS) into transposed arrays. One work-item handles one
// super block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_d : one element per block at row + blk*ne01 + batch offset.
// dst_ql: 32 uints per block (nibble-regrouped low bits), ne01 apart.
// dst_qh: 16 uints per block (2-bit fields regrouped), ne01 apart.
// dst_s : 16 scale bytes per block, kept in the source block order.
kernel void kernel_convert_block_q6_k_trans4_ns(
    __global struct block_q6_K * src0,
    __global uint * dst_ql,
    __global uint * dst_qh,
    __global half * dst_d,
    __global char * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;

    const uint src_idx = blk + row * blks_per_row + batch_off;
    const uint dst_idx = row + blk * ne01 + batch_off;

    __global struct block_q6_K * src_blk = src0 + src_idx;

    dst_d[dst_idx] = src_blk->d;

    // Low bits: each 64-byte half is split into four 16-byte runs holding, in
    // order, the low nibbles of bytes 0..31, the low nibbles of bytes 32..63,
    // then the corresponding high nibbles. Two source bytes feed each result
    // byte.
    uint4 ql_words[8];
    uchar * ql_bytes = (uchar *)ql_words;
    for (int half_i = 0; half_i < 2; ++half_i) {
        const int h_off = half_i*64;
        for (int j = 0; j < 16; ++j) {
            uchar a0 = src_blk->ql[h_off + 2*j];
            uchar a1 = src_blk->ql[h_off + 2*j + 1];
            uchar b0 = src_blk->ql[h_off + 32 + 2*j];
            uchar b1 = src_blk->ql[h_off + 32 + 2*j + 1];
            ql_bytes[h_off + j     ] = convert_uchar(a0 & mask_0F) | convert_uchar((a1 & mask_0F) << 4);
            ql_bytes[h_off + j + 16] = convert_uchar(b0 & mask_0F) | convert_uchar((b1 & mask_0F) << 4);
            ql_bytes[h_off + j + 32] = convert_uchar((a0 & mask_F0) >> 4) | convert_uchar(a1 & mask_F0);
            ql_bytes[h_off + j + 48] = convert_uchar((b0 & mask_F0) >> 4) | convert_uchar(b1 & mask_F0);
        }
    }

    const uint ql_idx = batch_off * 32 + blk * ne01 * 32 + row;

    #pragma unroll
    for (int p = 0; p < 8; ++p) {
        uint4 w = ql_words[p];
        dst_ql[ql_idx + (p * 4 + 0) * ne01] = w.x;
        dst_ql[ql_idx + (p * 4 + 1) * ne01] = w.y;
        dst_ql[ql_idx + (p * 4 + 2) * ne01] = w.z;
        dst_ql[ql_idx + (p * 4 + 3) * ne01] = w.w;
    }

    // High bits: every qh byte carries four 2-bit fields. Field f of byte l in
    // half n goes to word (n*4 + f)*2 + l/16, at bit position 2*(l % 16).
    uint qh_words[16] = {0};

    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            uchar h = src_blk->qh[n*32 + l];
            int word_sel = l / 16;
            int bit_pos  = (l % 16) * 2;
            for (int f = 0; f < 4; ++f) {
                qh_words[(n*4 + f)*2 + word_sel] |= ((uint)((h >> (2*f)) & 0x03)) << bit_pos;
            }
        }
    }

    const uint qh_idx = batch_off * 16 + blk * ne01 * 16 + row;

    for (int p = 0; p < 16; ++p) {
        dst_qh[qh_idx + p * ne01] = qh_words[p];
    }

    // Scales are copied byte for byte.
    __global char * scales_out = dst_s + (batch * ne01 + row) * blks_per_row * 16 + blk * 16;
    #pragma unroll
    for (int k = 0; k < 16; ++k) {
        scales_out[k] = src_blk->scales[k];
    }
}
1288
+
1289
// Rebuild block_q6_K (AOS) from the transposed arrays: nibble-regrouped low
// bits, regrouped 2-bit high fields, d and scales. One work-item handles one
// super block:
//   get_global_id(0) = row, get_global_id(1) = block index along K,
//   get_global_id(2) = batch.
kernel void kernel_restore_block_q6_k_trans4_ns(
    __global uint * src_ql,
    __global uint * src_qh,
    __global half * src_d,
    __global char * src_s,
    __global struct block_q6_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1); // block index along K
    const uint row   = get_global_id(0); // row index
    const uint batch = get_global_id(2); // batch index

    // Work-items past the last row have nothing to do.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_off    = batch * blks_per_row * ne01;

    const uint src_idx = row + blk * ne01 + batch_off;
    const uint dst_idx = blk + row * blks_per_row + batch_off;

    __global struct block_q6_K * dst_blk = dst0 + dst_idx;

    dst_blk->d = src_d[src_idx];

    // Gather the 32 strided words holding the low bits.
    const uint ql_idx = batch_off * 32 + blk * ne01 * 32 + row;
    uint4 ql_words[8];
    for (int p = 0; p < 8; ++p) {
        ql_words[p].x = src_ql[ql_idx + (p * 4 + 0) * ne01];
        ql_words[p].y = src_ql[ql_idx + (p * 4 + 1) * ne01];
        ql_words[p].z = src_ql[ql_idx + (p * 4 + 2) * ne01];
        ql_words[p].w = src_ql[ql_idx + (p * 4 + 3) * ne01];
    }

    // Inverse nibble regrouping: in each 64-byte half, the runs at +0 / +32
    // rebuild original bytes 0..31 and the runs at +16 / +48 rebuild bytes
    // 32..63.
    uchar * ql_bytes = (uchar *)ql_words;
    for (int half_i = 0; half_i < 2; ++half_i) {
        const int h_off = half_i*64;
        for (int j = 0; j < 16; ++j) {
            uchar lo_a = ql_bytes[h_off + j];
            uchar lo_b = ql_bytes[h_off + j + 16];
            uchar hi_a = ql_bytes[h_off + j + 32];
            uchar hi_b = ql_bytes[h_off + j + 48];
            dst_blk->ql[h_off + 2*j]          = convert_uchar((lo_a & mask_0F) | ((hi_a & mask_0F) << 4));
            dst_blk->ql[h_off + 2*j + 1]      = convert_uchar(((lo_a & mask_F0) >> 4) | (hi_a & mask_F0));
            dst_blk->ql[h_off + 32 + 2*j]     = convert_uchar((lo_b & mask_0F) | ((hi_b & mask_0F) << 4));
            dst_blk->ql[h_off + 32 + 2*j + 1] = convert_uchar(((lo_b & mask_F0) >> 4) | (hi_b & mask_F0));
        }
    }

    // Gather the 16 strided words holding the 2-bit high fields.
    const uint qh_idx = batch_off * 16 + blk * ne01 * 16 + row;
    uint qh_words[16];
    for (int p = 0; p < 16; ++p) {
        qh_words[p] = src_qh[qh_idx + p * ne01];
    }

    // Field f of byte l in half n lives in word (n*4 + f)*2 + l/16 at bit
    // position 2*(l % 16); the four fields are packed back into one byte.
    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            int word_sel = l / 16;
            int bit_pos  = (l % 16) * 2;
            uchar h = 0;
            for (int f = 0; f < 4; ++f) {
                h |= (uchar)(((qh_words[(n*4 + f)*2 + word_sel] >> bit_pos) & 0x03) << (2*f));
            }
            dst_blk->qh[n*32 + l] = h;
        }
    }

    // Scales are copied byte for byte.
    __global char * scales_in = src_s + (batch * ne01 + row) * blks_per_row * 16 + blk * 16;
    for (int k = 0; k < 16; ++k) {
        dst_blk->scales[k] = scales_in[k];
    }
}
1363
+
1364
//------------------------------------------------------------------------------
// block_mxfp4
// One exponent byte followed by QK_MXFP4 / 2 quant bytes: 17 bytes in total,
// with qs starting at byte offset 1.
// NOTE(review): presumably two 4-bit values per quant byte, giving QK_MXFP4
// values per block - confirm against the host-side definition.
//------------------------------------------------------------------------------
#define QK_MXFP4 32
struct block_mxfp4 {
    uchar e;                // E8M0
    uchar qs[QK_MXFP4 / 2]; // packed quants
};
1372
+
1373
//------------------------------------------------------------------------------
// kernel_convert_block_mxfp4
// Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA):
// the quant bytes and the per-block exponent byte.
// This kernel does not deshuffle the bits.
// One work-item per block; there is no bounds check.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    const size_t gid = get_global_id(0);

    global struct block_mxfp4 * src_blk = src0 + gid;
    global uchar * out_q = dst_q + QK_MXFP4 / 2 * gid;

    dst_e[gid] = src_blk->e;

    // Quant bytes are copied verbatim.
    for (int k = 0; k < QK_MXFP4 / 2; ++k) {
        out_q[k] = src_blk->qs[k];
    }
}
1393
+
1394
// Convert block_mxfp4 (AOS) into two transposed arrays without changing the
// nibble order. One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// Both outputs are indexed by row + blk*ne01 + batch offset; dst_q receives the
// block's 16 quant bytes as one uint4, dst_e the exponent byte.
kernel void kernel_convert_block_mxfp4_trans(
    global struct block_mxfp4 * src0,
    __global uint4 * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // ne01 is also the output stride, so a row index >= ne01 would land in
    // another block's slot. Drop such work-items (padded NDRange).
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;

    // b->qs starts at byte offset 1 of a 17-byte struct, so it is not 16-byte
    // aligned in general and must not be dereferenced as a uint4. vload16 only
    // needs byte alignment; the bytes are staged in a private uint4 and stored
    // to the (aligned) destination from there.
    uint4 q;
    vstore16(vload16(0, b->qs), 0, (uchar *)(&q));

    dst_q[dst_blk_offset] = q;
    dst_e[dst_blk_offset] = b->e;
}
1414
+
1415
// Rebuild block_mxfp4 (AOS) from the two flat arrays written by the plain
// convert kernel: QK_MXFP4 / 2 quant bytes and one exponent byte per block.
// One work-item per block; there is no bounds check.
//
// src_e is a byte array (one E8M0 exponent per block), so it is declared as
// uchar rather than half; it is indexed in bytes.
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    const size_t gid = get_global_id(0);

    global struct block_mxfp4 * b = dst + gid;
    global uchar * q = src_q + QK_MXFP4 / 2 * gid;

    b->e = src_e[gid];

    // Quant bytes are copied verbatim.
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
1429
+
1430
// Rebuild block_mxfp4 (AOS) from the two transposed arrays. One work-item
// handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// Both inputs are indexed by row + blk*ne01 + batch offset.
kernel void kernel_restore_block_mxfp4_trans(
    __global uint4 * src_q,
    __global uchar * src_e,
    global struct block_mxfp4 * dst,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // ne01 is also the input stride, so a row index >= ne01 would read another
    // block's slot and write past the row. Drop such work-items.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = dst + dst_blk_offset;

    // b->qs starts at byte offset 1 of a 17-byte struct, so it is not 16-byte
    // aligned in general and must not be written through a uint4 pointer.
    // The aligned uint4 is read into private memory and written out with
    // vstore16, which only needs byte alignment.
    uint4 q = src_q[src_blk_offset];
    vstore16(vload16(0, (uchar *)(&q)), 0, b->qs);

    b->e = src_e[src_blk_offset];
}
1450
+
1451
// Convert block_mxfp4 (AOS) into two transposed arrays with the quant nibbles
// regrouped. One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// dst_e receives the exponent byte at row + blk*ne01 + batch offset; dst_q
// receives the block's 16 quant bytes as four uints placed ne01 elements apart.
kernel void kernel_convert_block_mxfp4_trans4_ns(
    global struct block_mxfp4 * src0,
    __global uint * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;
    dst_e[dst_blk_offset] = b->e;

    // extract quantization and unshuffle
    // b->qs starts at byte offset 1 of a 17-byte struct, so it is not aligned
    // for a vector dereference; vload16 only needs byte alignment.
    uchar16 pre_block  = vload16(0, b->qs);
    ushort8 post_block = (ushort8)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    // Result byte i holds the low nibbles of source bytes 2i and 2i+1, result
    // byte i + QK_MXFP4/4 holds their high nibbles.
    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0           ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK_MXFP4 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(post_block);

    // Four words per block, each written to its own ne01-strided plane.
    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_q[offset]            = q_block.x;
    dst_q[offset + ne01]     = q_block.y;
    dst_q[offset + ne01 * 2] = q_block.z;
    dst_q[offset + ne01 * 3] = q_block.w;
}
1497
+
1498
// Rebuild block_mxfp4 (AOS) from the transposed, nibble-regrouped arrays.
// One work-item handles one block:
//   get_global_id(0) = row, get_global_id(1) = block within the row,
//   get_global_id(2) = batch.
// src_e is read at row + blk*ne01 + batch offset; src_q supplies four uints per
// block that lie ne01 elements apart.
kernel void kernel_restore_block_mxfp4_trans4_ns(
    __global uint * src_q,
    __global uchar * src_e,
    __global struct block_mxfp4 * dst0,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    // Work-items past the last row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_d_offset   = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_mxfp4 * b = dst0 + dst_blk_offset;
    b->e = src_e[src_d_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_q[src_q_offset];
    q_block.y = src_q[src_q_offset + ne01];
    q_block.z = src_q[src_q_offset + ne01 * 2];
    q_block.w = src_q[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    uchar16 pre_block  = (uchar16)(0);

    uchar * pre_block_ptr  = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    // Inverse of the convert step: bytes i and i + QK_MXFP4/4 of the regrouped
    // data are recombined into original bytes 2i and 2i+1.
    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK_MXFP4 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    // b->qs starts at byte offset 1 of a 17-byte struct, so it is not aligned
    // for a vector store through a cast pointer; vstore16 only needs byte
    // alignment.
    vstore16(pre_block, 0, b->qs);
}
1544
+
1545
+
1546
//------------------------------------------------------------------------------
// block_q8_0
// One half-precision scale followed by QK8_0 signed 8-bit quants.
//------------------------------------------------------------------------------
typedef struct {
    half d;         // delta
    char qs[QK8_0]; // quants
} block_q8_0;
1553
+
1554
// Convert block_q8_0 (AOS) to two flat arrays: QK8_0 quant bytes per block in
// dst_q and one scale per block in dst_d. One work-item per block; there is no
// bounds check.
kernel void kernel_convert_block_q8_0(
    global block_q8_0 * src0,
    global uchar * dst_q,
    global half * dst_d
) {
    const size_t gid = get_global_id(0);

    global block_q8_0 * src_blk = src0 + gid;
    global uchar * out_q = dst_q + QK8_0*gid;

    dst_d[gid] = src_blk->d;

    // Quants are copied one by one (char source, uchar destination).
    for (int k = 0; k < QK8_0; ++k) {
        out_q[k] = src_blk->qs[k];
    }
}
1569
+
1570
// Rebuild block_q8_0 (AOS) from the two flat arrays: QK8_0 quant bytes per
// block in src_q and one scale per block in src_d. One work-item per block;
// there is no bounds check.
kernel void kernel_restore_block_q8_0(
    global uchar * src_q,
    global half * src_d,
    global block_q8_0 * dst
) {
    const size_t gid = get_global_id(0);

    global block_q8_0 * dst_blk = dst + gid;
    global uchar * in_q = src_q + QK8_0*gid;

    dst_blk->d = src_d[gid];

    // Quants are copied one by one (uchar source, char destination).
    for (int k = 0; k < QK8_0; ++k) {
        dst_blk->qs[k] = in_q[k];
    }
}
1584
+
1585
// Rebuild one row of block_q8_0 (AOS) from transposed arrays. Each work-item
// (get_global_id(0)) walks all ne00 / QK8_0 blocks of its row:
//   - the scale of successive blocks is ne01 elements apart in src_d;
//   - quants come in groups of 4 bytes, successive groups 4 * ne01 bytes apart
//     in src_q.
// There is no bounds check on the row index.
kernel void kernel_restore_block_q8_0_trans(
    global uchar * src_q,
    global half * src_d,
    global block_q8_0 * dst,
    uint ne00,
    uint ne01
){
    const uint   blks_per_row = ne00 / QK8_0;
    const size_t row          = get_global_id(0);

    global block_q8_0 * out_blk = dst + row * blks_per_row;
    global uchar * q_in = src_q + row * 4; // 4 8-bit packed
    global half  * d_in = src_d + row;

    for (uint blk = 0; blk < blks_per_row; ++blk, ++out_blk) {
        out_blk->d = *d_in;
        d_in += ne01;

        for (uint i = 0; i < QK8_0; i += 4) {
            // One packed group of four quants.
            for (uint j = 0; j < 4; ++j) {
                out_blk->qs[i + j] = q_in[j];
            }

            q_in += 4 * ne01; // M stride
        }
    }
}
1615
+
1616
+ //------------------------------------------------------------------------------
1617
+ // kernel_convert_block_q4_K
1618
+ // Convert the block_q4_K format to 4 separate arrays (AOS -> SOA).
1619
+ // This kernel does not deshuffle the bits.
1620
+ // Each thread processes a super block.
1621
+ // Mask args are just to keep the signature consistent with the no-shuffle
1622
+ // version and they are not used in this kernel.
1623
+ //------------------------------------------------------------------------------
1624
+ kernel void kernel_convert_block_q4_K( // copies each block_q4_K field into its own flat array, bytes unchanged; one work-item per super block
1625
+ global struct block_q4_K * src0, // input super blocks, indexed by get_global_id(0)
1626
+ global uchar * dst_q, // output quant bytes, QK_K/2 per super block
1627
+ global uchar * dst_s, // output scale bytes, K_SCALE_SIZE per super block
1628
+ global half * dst_d, // output d, one half per super block
1629
+ global half * dst_dm, // output dm, one half per super block
1630
+ uchar mask_0F, // not referenced in this kernel body
1631
+ uchar mask_F0 // not referenced in this kernel body
1632
+ ) {
1633
+ global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1634
+ global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
1635
+ global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
1636
+ global half * d = (global half *) dst_d + get_global_id(0);
1637
+ global half * dm = (global half *) dst_dm + get_global_id(0);
1638
+
1639
+ *d = b->d;
1640
+ *dm = b->dm;
1641
+
1642
+ for (int i = 0; i < QK_K/2; ++i) {
1643
+ q[i] = b->q[i]; // straight byte copy, no nibble rearrangement
1644
+ }
1645
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1646
+ s[i] = b->s[i];
1647
+ }
1648
+ }
1649
+
1650
+ // Restore block_q4_K from flattened arrays.
1651
+ // Each thread processes a super block.
1652
+ // Mask args are just to keep the signature consistent with the no-shuffle ones.
1653
+ kernel void kernel_restore_block_q4_K( // copies the flat arrays back into block_q4_K fields, bytes unchanged; one work-item per super block
1654
+ global uchar * src_q, // input quant bytes, QK_K/2 per super block
1655
+ global uchar * src_s, // input scale bytes, K_SCALE_SIZE per super block
1656
+ global half * src_d, // input d, one half per super block
1657
+ global half * src_dm, // input dm, one half per super block
1658
+ global struct block_q4_K * dst, // output super blocks, indexed by get_global_id(0)
1659
+ uchar mask_0F, // not referenced in this kernel body
1660
+ uchar mask_F0 // not referenced in this kernel body
1661
+ ) {
1662
+ global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1663
+ global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
1664
+ global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
1665
+ global half * d = (global half *) src_d + get_global_id(0);
1666
+ global half * dm = (global half *) src_dm + get_global_id(0);
1667
+
1668
+ b->d = *d;
1669
+ b->dm = *dm;
1670
+
1671
+ for (int i = 0; i < QK_K/2; ++i) {
1672
+ b->q[i] = q[i]; // straight byte copy, no nibble rearrangement
1673
+ }
1674
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1675
+ b->s[i] = s[i];
1676
+ }
1677
+ }
1678
+
1679
+ kernel void kernel_convert_block_q4_K_noshuffle( // splits block_q4_K into flat arrays and regroups the quant nibbles within each 32-byte group; one work-item per super block
1680
+ global struct block_q4_K * src0, // input super blocks, indexed by get_global_id(0)
1681
+ global uchar * dst_q, // output quant bytes (regrouped), QK_K/2 per super block
1682
+ global uchar * dst_s, // output scale bytes, K_SCALE_SIZE per super block
1683
+ global half * dst_d, // output d, one half per super block
1684
+ global half * dst_dm, // output dm, one half per super block
1685
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
1686
+ uchar mask_F0 // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
1687
+ ) {
1688
+ global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1689
+ global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
1690
+ global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
1691
+ global half * d = (global half *) dst_d + get_global_id(0);
1692
+ global half * dm = (global half *) dst_dm + get_global_id(0);
1693
+
1694
+ *d = b->d;
1695
+ *dm = b->dm;
1696
+
1697
+ for (int i = 0; i < QK_K / 64; ++i) { // i selects a 32-byte group of quant bytes
1698
+ for (int j = 0; j < 16; ++j) { // j selects a pair of adjacent source bytes within the group
1699
+ uchar x0 = b->q[i*32 + 2*j];
1700
+ uchar x1 = b->q[i*32 + 2*j + 1];
1701
+ q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); // first half of the group: mask_0F bits of x0 (kept in place) and of x1 (shifted up 4)
1702
+ q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); // second half of the group: mask_F0 bits of x0 (shifted down 4) and of x1 (kept in place)
1703
+ }
1704
+ }
1705
+
1706
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1707
+ s[i] = b->s[i]; // scales are copied unchanged
1708
+ }
1709
+ }
1710
+
1711
+ kernel void kernel_restore_block_q4_K_noshuffle( // rebuilds block_q4_K from flat arrays, recombining nibbles from the two halves of each 32-byte group; one work-item per super block
1712
+ global uchar * src_q, // input quant bytes (regrouped layout), QK_K/2 per super block
1713
+ global uchar * src_s, // input scale bytes, K_SCALE_SIZE per super block
1714
+ global half * src_d, // input d, one half per super block
1715
+ global half * src_dm, // input dm, one half per super block
1716
+ global struct block_q4_K * dst, // output super blocks, indexed by get_global_id(0)
1717
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
1718
+ uchar mask_F0 // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
1719
+ ) {
1720
+ global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1721
+ global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
1722
+ global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
1723
+ global half * d = (global half *) src_d + get_global_id(0);
1724
+ global half * dm = (global half *) src_dm + get_global_id(0);
1725
+
1726
+ b->d = *d;
1727
+ b->dm = *dm;
1728
+
1729
+ for (int i = 0; i < QK_K / 64; ++i) { // i selects a 32-byte group of quant bytes
1730
+ for (int j = 0; j < 16; ++j) {
1731
+ uchar lo = q[i*32 + j]; // byte from the first half of the group
1732
+ uchar hi = q[i*32 + j + 16]; // byte from the second half of the group
1733
+ b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); // even output byte: mask_0F bits of lo (in place) and of hi (shifted up 4)
1734
+ b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); // odd output byte: mask_F0 bits of lo (shifted down 4) and of hi (in place)
1735
+ }
1736
+ }
1737
+
1738
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1739
+ b->s[i] = s[i]; // scales are copied unchanged
1740
+ }
1741
+ }
1742
+
1743
+ //------------------------------------------------------------------------------
1744
+ // kernel_convert_block_q5_K
1745
+ // Convert the block_q5_K format to 5 separate arrays (AOS -> SOA).
1746
+ // Each thread processes a super block.
1747
+ //------------------------------------------------------------------------------
1748
+ kernel void kernel_convert_block_q5_K( // copies each block_q5_K field into its own flat array, bytes unchanged; one work-item per super block
1749
+ global struct block_q5_K * src0, // input super blocks, indexed by get_global_id(0)
1750
+ global uchar * dst_q, // output qs bytes, QK_K/2 per super block
1751
+ global uchar * dst_qh, // output qh bytes, QK_K/8 per super block
1752
+ global uchar * dst_s, // output scale bytes, K_SCALE_SIZE per super block
1753
+ global half * dst_d, // output d, one half per super block
1754
+ global half * dst_dm, // output dm, one half per super block
1755
+ uchar mask_0F, // not referenced in this kernel body
1756
+ uchar mask_F0 // not referenced in this kernel body
1757
+ ) {
1758
+ global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1759
+ global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
1760
+ global uchar * qh = (global uchar *) dst_qh + QK_K/8*get_global_id(0);
1761
+ global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0);
1762
+ global half * d = (global half *) dst_d + get_global_id(0);
1763
+ global half * dm = (global half *) dst_dm + get_global_id(0);
1764
+
1765
+ *d = b->d;
1766
+ *dm = b->dm;
1767
+
1768
+ for (int i = 0; i < QK_K/2; ++i) {
1769
+ q[i] = b->qs[i]; // straight byte copy
1770
+ }
1771
+ for (int i = 0; i < QK_K/8; ++i) {
1772
+ qh[i] = b->qh[i]; // straight byte copy
1773
+ }
1774
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1775
+ s[i] = b->s[i];
1776
+ }
1777
+ }
1778
+
1779
+ // Restore block_q5_K from flattened arrays.
1780
+ // Each thread processes a super block.
1781
+ kernel void kernel_restore_block_q5_K( // copies the flat arrays back into block_q5_K fields, bytes unchanged; one work-item per super block
1782
+ global uchar * src_q, // input qs bytes, QK_K/2 per super block
1783
+ global uchar * src_qh, // input qh bytes, QK_K/8 per super block
1784
+ global uchar * src_s, // input scale bytes, K_SCALE_SIZE per super block
1785
+ global half * src_d, // input d, one half per super block
1786
+ global half * src_dm, // input dm, one half per super block
1787
+ global struct block_q5_K * dst, // output super blocks, indexed by get_global_id(0)
1788
+ uchar mask_0F, // not referenced in this kernel body
1789
+ uchar mask_F0 // not referenced in this kernel body
1790
+ ) {
1791
+ global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1792
+ global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
1793
+ global uchar * qh = (global uchar *) src_qh + QK_K/8*get_global_id(0);
1794
+ global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0);
1795
+ global half * d = (global half *) src_d + get_global_id(0);
1796
+ global half * dm = (global half *) src_dm + get_global_id(0);
1797
+
1798
+ b->d = *d;
1799
+ b->dm = *dm;
1800
+
1801
+ for (int i = 0; i < QK_K/2; ++i) {
1802
+ b->qs[i] = q[i]; // straight byte copy
1803
+ }
1804
+ for (int i = 0; i < QK_K/8; ++i) {
1805
+ b->qh[i] = qh[i]; // straight byte copy
1806
+ }
1807
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1808
+ b->s[i] = s[i];
1809
+ }
1810
+ }
1811
+
1812
+ kernel void kernel_convert_block_q5_K_noshuffle( // splits block_q5_K into flat arrays, regrouping the qs nibbles and bit-transposing qh; one work-item per super block
1813
+ global struct block_q5_K * src0, // input super blocks, indexed by get_global_id(0)
1814
+ global uchar * dst_q, // output qs bytes (regrouped), QK_K/2 per super block
1815
+ global uchar * dst_qh, // output qh bytes (bit-transposed), QK_K/8 per super block
1816
+ global uchar * dst_s, // output scale bytes, K_SCALE_SIZE per super block
1817
+ global half * dst_d, // output d, one half per super block
1818
+ global half * dst_dm, // output dm, one half per super block
1819
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
1820
+ uchar mask_F0 // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
1821
+ ) {
1822
+ global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1823
+ global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
1824
+ global uchar * qh = (global uchar *) dst_qh + QK_K/8 * get_global_id(0);
1825
+ global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
1826
+ global half * d = (global half *) dst_d + get_global_id(0);
1827
+ global half * dm = (global half *) dst_dm + get_global_id(0);
1828
+
1829
+ *d = b->d;
1830
+ *dm = b->dm;
1831
+
1832
+ for (int i = 0; i < QK_K / 64; ++i) { // i selects a 32-byte group of qs bytes
1833
+ for (int j = 0; j < 16; ++j) { // j selects a pair of adjacent source bytes within the group
1834
+ uchar x0 = b->qs[i*32 + 2*j];
1835
+ uchar x1 = b->qs[i*32 + 2*j + 1];
1836
+ q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); // first half of the group: mask_0F bits of x0 (in place) and of x1 (shifted up 4)
1837
+ q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); // second half of the group: mask_F0 bits of x0 (shifted down 4) and of x1 (in place)
1838
+ }
1839
+ }
1840
+
1841
+ for (int l = 0; l < QK_K/8; ++l) { // each output qh byte gathers one bit position from 8 consecutive source qh bytes
1842
+ uchar x0 = 0;
1843
+ for (int i = 0; i < 8; ++i) {
1844
+ x0 |= ((b->qh[(l%4)*8+i] >> (l/4)) & 0x01) << i; // bit (l/4) of source byte (l%4)*8+i becomes bit i of the output byte
1845
+ }
1846
+ qh[l] = x0;
1847
+ }
1848
+
1849
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1850
+ s[i] = b->s[i]; // scales are copied unchanged
1851
+ }
1852
+ }
1853
+
1854
+ kernel void kernel_restore_block_q5_K_noshuffle( // rebuilds block_q5_K from flat arrays, recombining the qs nibbles and bit-transposing qh back; one work-item per super block
1855
+ global uchar * src_q, // input qs bytes (regrouped layout), QK_K/2 per super block
1856
+ global uchar * src_qh, // input qh bytes (bit-transposed layout), QK_K/8 per super block
1857
+ global uchar * src_s, // input scale bytes, K_SCALE_SIZE per super block
1858
+ global half * src_d, // input d, one half per super block
1859
+ global half * src_dm, // input dm, one half per super block
1860
+ global struct block_q5_K * dst, // output super blocks, indexed by get_global_id(0)
1861
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
1862
+ uchar mask_F0 // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
1863
+ ) {
1864
+ global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0); // NOTE(review): no bounds guard here; launch size presumably equals the super-block count - confirm
1865
+ global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
1866
+ global uchar * qh = (global uchar *) src_qh + QK_K/8 * get_global_id(0);
1867
+ global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
1868
+ global half * d = (global half *) src_d + get_global_id(0);
1869
+ global half * dm = (global half *) src_dm + get_global_id(0);
1870
+
1871
+ b->d = *d;
1872
+ b->dm = *dm;
1873
+
1874
+ for (int i = 0; i < QK_K / 64; ++i) { // i selects a 32-byte group of qs bytes
1875
+ for (int j = 0; j < 16; ++j) {
1876
+ uchar lo = q[i*32 + j]; // byte from the first half of the group
1877
+ uchar hi = q[i*32 + j + 16]; // byte from the second half of the group
1878
+ b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); // even output byte: mask_0F bits of lo (in place) and of hi (shifted up 4)
1879
+ b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); // odd output byte: mask_F0 bits of lo (shifted down 4) and of hi (in place)
1880
+ }
1881
+ }
1882
+
1883
+ for (int g = 0; g < 4; ++g) { // g selects a run of 8 output qh bytes
1884
+ for (int i = 0; i < 8; ++i) {
1885
+ uchar x0 = 0;
1886
+ for (int k = 0; k < 8; ++k) {
1887
+ x0 |= ((qh[4*k+g] >> i) & 0x01) << k; // bit i of input byte 4*k+g becomes bit k of the output byte
1888
+ }
1889
+ b->qh[g*8+i] = x0;
1890
+ }
1891
+ }
1892
+
1893
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1894
+ b->s[i] = s[i]; // scales are copied unchanged
1895
+ }
1896
+ }
1897
+
1898
+ //------------------------------------------------------------------------------
1899
+ // kernel_convert_block_q6_K
1900
+ // Convert the block_q6_K format to 4 separate arrays (AOS -> SOA).
1901
+ // This kernel does not deshuffle the bits.
1902
+ // Each thread processes a super block.
1903
+ //------------------------------------------------------------------------------
1904
+ kernel void kernel_convert_block_q6_K( // copies each block_q6_K field into its own flat array, bytes unchanged; one work-item per super block
1905
+ global struct block_q6_K * src0, // input super blocks, indexed by get_global_id(0)
1906
+ global uchar * dst_ql, // output ql bytes, QK_K/2 per super block
1907
+ global uchar * dst_qh, // output qh bytes, QK_K/4 per super block
1908
+ global char * dst_s, // output scales, QK_K/16 per super block
1909
+ global half * dst_d, // output d, one half per super block
1910
+ uchar mask_lsb_8, // not referenced in this kernel body
1911
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
1912
+ ) {
1913
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
1914
+ return;
1915
+ }
1916
+ global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
1917
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
1918
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
1919
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
1920
+ global half * d = (global half *) dst_d + get_global_id(0);
1921
+
1922
+ *d = b->d;
1923
+
1924
+ for (int i = 0; i < QK_K/2; ++i) {
1925
+ ql[i] = b->ql[i]; // straight byte copy
1926
+ }
1927
+ for (int i = 0; i < QK_K/4; ++i) {
1928
+ qh[i] = b->qh[i]; // straight byte copy
1929
+ }
1930
+ for (int i = 0; i < QK_K/16; ++i) {
1931
+ s[i] = b->scales[i];
1932
+ }
1933
+ }
1934
+
1935
+ // Restore block_q6_K from flattened arrays.
1936
+ // Each thread processes a super block.
1937
+ kernel void kernel_restore_block_q6_K( // copies the flat arrays back into block_q6_K fields, bytes unchanged; one work-item per super block
1938
+ global uchar * dst_ql, // read as input despite the dst_ prefix: ql bytes, QK_K/2 per super block
1939
+ global uchar * dst_qh, // read as input despite the dst_ prefix: qh bytes, QK_K/4 per super block
1940
+ global char * dst_s, // read as input despite the dst_ prefix: scales, QK_K/16 per super block
1941
+ global half * dst_d, // read as input despite the dst_ prefix: d, one half per super block
1942
+ global struct block_q6_K * dst, // output super blocks, indexed by get_global_id(0)
1943
+ uchar mask_lsb_8, // not referenced in this kernel body
1944
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
1945
+ ) {
1946
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
1947
+ return;
1948
+ }
1949
+ global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
1950
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
1951
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
1952
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
1953
+ global half * d = (global half *) dst_d + get_global_id(0);
1954
+
1955
+ b->d = *d;
1956
+
1957
+ for (int i = 0; i < QK_K/2; ++i) {
1958
+ b->ql[i] = ql[i]; // straight byte copy
1959
+ }
1960
+ for (int i = 0; i < QK_K/4; ++i) {
1961
+ b->qh[i] = qh[i]; // straight byte copy
1962
+ }
1963
+ for (int i = 0; i < QK_K/16; ++i) {
1964
+ b->scales[i] = s[i];
1965
+ }
1966
+ }
1967
+
1968
+ kernel void kernel_convert_block_q6_K_noshuffle( // splits block_q6_K into flat arrays, regrouping the ql nibbles and the 2-bit qh fields; one work-item per super block
1969
+ global struct block_q6_K * src0, // input super blocks, indexed by get_global_id(0)
1970
+ global uchar * dst_ql, // output ql bytes (regrouped), QK_K/2 per super block
1971
+ global uchar * dst_qh, // output qh bytes (regrouped), QK_K/4 per super block
1972
+ global char * dst_s, // output scales, QK_K/16 per super block
1973
+ global half * dst_d, // output d, one half per super block
1974
+ uchar mask_lsb_8, // ANDed with every ql/qh byte read; presumably 0xFF - confirm against host code
1975
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
1976
+ ) {
1977
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
1978
+ return;
1979
+ }
1980
+ global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
1981
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
1982
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
1983
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
1984
+ global half * d = (global half *) dst_d + get_global_id(0);
1985
+
1986
+ *d = b->d;
1987
+
1988
+ for (int i = 0; i < QK_K/2/4; ++i) { // each iteration handles one byte pair from each 64-byte half of ql
1989
+ uchar x0 = b->ql[i*2 + 0] & mask_lsb_8;
1990
+ uchar x1 = b->ql[i*2 + 1] & mask_lsb_8;
1991
+ ql[i + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4); // low nibbles of the pair
1992
+ ql[i + 32] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0); // high nibbles of the pair
1993
+
1994
+ uchar x2 = b->ql[i*2 + 0 + 64] & mask_lsb_8; // same pattern for the second 64-byte half
1995
+ uchar x3 = b->ql[i*2 + 1 + 64] & mask_lsb_8;
1996
+ ql[i + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
1997
+ ql[i + 96] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
1998
+ }
1999
+
2000
+ for (int i = 0; i < QK_K/4/8; ++i) { // each iteration handles 4 source bytes from each 32-byte half of qh
2001
+ uchar x0 = b->qh[i*4 + 0] & mask_lsb_8;
2002
+ uchar x1 = b->qh[i*4 + 1] & mask_lsb_8;
2003
+ uchar x2 = b->qh[i*4 + 2] & mask_lsb_8;
2004
+ uchar x3 = b->qh[i*4 + 3] & mask_lsb_8;
2005
+ qh[i + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6); // bits 0-1 of x0..x3
2006
+ qh[i + 8] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4); // bits 2-3 of x0..x3
2007
+ qh[i + 16] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2); // bits 4-5 of x0..x3
2008
+ qh[i + 24] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0); // bits 6-7 of x0..x3
2009
+
2010
+ uchar x4 = b->qh[i*4 + 0 + 32] & mask_lsb_8; // same pattern for the second 32-byte half
2011
+ uchar x5 = b->qh[i*4 + 1 + 32] & mask_lsb_8;
2012
+ uchar x6 = b->qh[i*4 + 2 + 32] & mask_lsb_8;
2013
+ uchar x7 = b->qh[i*4 + 3 + 32] & mask_lsb_8;
2014
+ qh[i + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
2015
+ qh[i + 40] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
2016
+ qh[i + 48] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
2017
+ qh[i + 56] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
2018
+ }
2019
+
2020
+ for (int i = 0; i < QK_K/16; ++i) {
2021
+ s[i] = b->scales[i]; // scales are copied unchanged
2022
+ }
2023
+ }
2024
+
2025
+ kernel void kernel_restore_block_q6_K_noshuffle( // rebuilds block_q6_K from flat arrays, regrouping the ql nibbles and 2-bit qh fields back into the block; one work-item per super block
2026
+ global uchar * src_ql, // input ql bytes (regrouped layout), QK_K/2 per super block
2027
+ global uchar * src_qh, // input qh bytes (regrouped layout), QK_K/4 per super block
2028
+ global char * src_s, // input scales, QK_K/16 per super block
2029
+ global half * src_d, // input d, one half per super block
2030
+ global struct block_q6_K * dst, // output super blocks, indexed by get_global_id(0)
2031
+ uchar mask_lsb_8, // ANDed with every ql/qh byte read; presumably 0xFF - confirm against host code
2032
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
2033
+ ) {
2034
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
2035
+ return;
2036
+ }
2037
+ global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
2038
+ global uchar * ql = (global uchar *) src_ql + QK_K/2*get_global_id(0);
2039
+ global uchar * qh = (global uchar *) src_qh + QK_K/4*get_global_id(0);
2040
+ global char * s = (global char *) src_s + QK_K/16*get_global_id(0);
2041
+ global half * d = (global half *) src_d + get_global_id(0);
2042
+
2043
+ b->d = *d;
2044
+
2045
+ for (int i = 0; i < QK_K/2/4; ++i) { // each iteration rebuilds one byte pair in each 64-byte half of ql
2046
+ uchar x0 = ql[i + 0] & mask_lsb_8;
2047
+ uchar x1 = ql[i + 32] & mask_lsb_8;
2048
+ b->ql[i*2 + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4); // low nibbles of x0 and x1
2049
+ b->ql[i*2 + 1] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0); // high nibbles of x0 and x1
2050
+
2051
+ uchar x2 = ql[i + 64] & mask_lsb_8; // same pattern for the second 64-byte half
2052
+ uchar x3 = ql[i + 96] & mask_lsb_8;
2053
+ b->ql[i*2 + 0 + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
2054
+ b->ql[i*2 + 1 + 64] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
2055
+ }
2056
+
2057
+ for (int i = 0; i < QK_K/4/8; ++i) { // each iteration rebuilds 4 output bytes in each 32-byte half of qh
2058
+ uchar x0 = qh[i + 0] & mask_lsb_8;
2059
+ uchar x1 = qh[i + 8] & mask_lsb_8;
2060
+ uchar x2 = qh[i + 16] & mask_lsb_8;
2061
+ uchar x3 = qh[i + 24] & mask_lsb_8;
2062
+ b->qh[i*4 + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6); // bits 0-1 of x0..x3
2063
+ b->qh[i*4 + 1] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4); // bits 2-3 of x0..x3
2064
+ b->qh[i*4 + 2] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2); // bits 4-5 of x0..x3
2065
+ b->qh[i*4 + 3] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0); // bits 6-7 of x0..x3
2066
+
2067
+ uchar x4 = qh[i + 0 + 32] & mask_lsb_8; // same pattern for the second 32-byte half
2068
+ uchar x5 = qh[i + 8 + 32] & mask_lsb_8;
2069
+ uchar x6 = qh[i + 16 + 32] & mask_lsb_8;
2070
+ uchar x7 = qh[i + 24 + 32] & mask_lsb_8;
2071
+ b->qh[i*4 + 0 + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
2072
+ b->qh[i*4 + 1 + 32] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
2073
+ b->qh[i*4 + 2 + 32] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
2074
+ b->qh[i*4 + 3 + 32] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
2075
+ }
2076
+
2077
+ for (int i = 0; i < QK_K/16; ++i) {
2078
+ b->scales[i] = s[i]; // scales are copied unchanged
2079
+ }
2080
+ }
2081
+
2082
+ //------------------------------------------------------------------------------
2083
+ // kernel_convert_block_iq4_nl
2084
+ // Convert the block_iq4_nl format to 2 separate arrays (AOS -> SOA).
2085
+ //------------------------------------------------------------------------------
2086
+ kernel void kernel_convert_block_iq4_nl( // copies each block_iq4_nl into a flat quant array and a d array, bytes unchanged; one work-item per block
2087
+ global struct block_iq4_nl * src0, // input blocks, indexed by get_global_id(0)
2088
+ global uchar * dst_q, // output quant bytes, QK4_NL/2 per block
2089
+ global half * dst_d, // output d, one half per block
2090
+ uchar mask_0F, // not referenced in this kernel body
2091
+ uchar mask_F0, // not referenced in this kernel body
2092
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
2093
+ ) {
2094
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
2095
+ return;
2096
+ }
2097
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
2098
+ global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
2099
+ global half * d = (global half *) dst_d + get_global_id(0);
2100
+
2101
+ *d = b->d;
2102
+
2103
+ for (int i = 0; i < QK4_NL/2; ++i) {
2104
+ q[i] = b->qs[i]; // straight byte copy, no nibble rearrangement
2105
+ }
2106
+ }
2107
+
2108
+ kernel void kernel_restore_block_iq4_nl( // copies the flat quant and d arrays back into block_iq4_nl, bytes unchanged; one work-item per block
2109
+ global uchar * src_q, // input quant bytes, QK4_NL/2 per block
2110
+ global half * src_d, // input d, one half per block
2111
+ global struct block_iq4_nl * dst, // output blocks, indexed by get_global_id(0)
2112
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing; note this kernel takes no mask args
2113
+ ) {
2114
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
2115
+ return;
2116
+ }
2117
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
2118
+ global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
2119
+ global half * d = (global half *) src_d + get_global_id(0);
2120
+
2121
+ b->d = *d;
2122
+
2123
+ for (int i = 0; i < QK4_NL/2; ++i) {
2124
+ b->qs[i] = q[i]; // straight byte copy, no nibble rearrangement
2125
+ }
2126
+ }
2127
+
2128
+ kernel void kernel_convert_block_iq4_nl_noshuffle( // splits block_iq4_nl into flat arrays and regroups the quant nibbles into two halves; one work-item per block
2129
+ global struct block_iq4_nl * src0, // input blocks, indexed by get_global_id(0)
2130
+ global uchar * dst_q, // output quant bytes (regrouped), QK4_NL/2 per block
2131
+ global half * dst_d, // output d, one half per block
2132
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
2133
+ uchar mask_F0, // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
2134
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
2135
+ ) {
2136
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
2137
+ return;
2138
+ }
2139
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
2140
+ global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
2141
+ global half * d = (global half *) dst_d + get_global_id(0);
2142
+
2143
+ *d = b->d;
2144
+ for (int i = 0; i < QK4_NL/4; ++i) { // each iteration handles one pair of adjacent source bytes
2145
+ uchar x0 = b->qs[2*i + 0];
2146
+ uchar x1 = b->qs[2*i + 1];
2147
+
2148
+ q[i + 0 ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); // first half of the output: mask_0F bits of x0 (in place) and of x1 (shifted up 4)
2149
+ q[i + QK4_NL/4] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); // second half of the output: mask_F0 bits of x0 (shifted down 4) and of x1 (in place)
2150
+ }
2151
+ }
2152
+
2153
+ kernel void kernel_restore_block_iq4_nl_noshuffle( // rebuilds block_iq4_nl from flat arrays, recombining nibbles from the two halves of the quant array; one work-item per block
2154
+ global uchar * src_q, // input quant bytes (regrouped layout), QK4_NL/2 per block
2155
+ global half * src_d, // input d, one half per block
2156
+ global struct block_iq4_nl * dst, // output blocks, indexed by get_global_id(0)
2157
+ uchar mask_0F, // ANDed to pick one nibble; presumably 0x0F - confirm against host code
2158
+ uchar mask_F0, // ANDed to pick the other nibble; presumably 0xF0 - confirm against host code
2159
+ ulong n_blk // work-items with get_global_id(0) >= n_blk do nothing
2160
+ ) {
2161
+ if (get_global_id(0) >= n_blk) { // guard for launches larger than n_blk
2162
+ return;
2163
+ }
2164
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
2165
+ global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
2166
+ global half * d = (global half *) src_d + get_global_id(0);
2167
+
2168
+ b->d = *d;
2169
+ for (int i = 0; i < QK4_NL/4; ++i) { // each iteration rebuilds one pair of adjacent output bytes
2170
+ uchar x0 = q[i + 0 ]; // byte from the first half of the input
2171
+ uchar x1 = q[i + QK4_NL/4]; // byte from the second half of the input
2172
+
2173
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4)); // even output byte: mask_0F bits of x0 (in place) and of x1 (shifted up 4)
2174
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0)); // odd output byte: mask_F0 bits of x0 (shifted down 4) and of x1 (in place)
264
2175
  }
265
2176
  }