whispercpp 1.3.6 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (828)
  1. checksums.yaml +4 -4
  2. data/.document +3 -0
  3. data/.rdoc_options +2 -0
  4. data/README.md +38 -5
  5. data/Rakefile +18 -3
  6. data/ext/dependencies.rb +10 -4
  7. data/ext/dependencies_for_windows.rb +17 -0
  8. data/ext/extconf.rb +20 -8
  9. data/ext/options.rb +54 -14
  10. data/ext/options_for_windows.rb +51 -0
  11. data/ext/ruby_whisper.c +36 -42
  12. data/ext/ruby_whisper.h +135 -0
  13. data/ext/ruby_whisper_context.c +107 -28
  14. data/ext/ruby_whisper_log_queue.c +180 -0
  15. data/ext/ruby_whisper_log_settable.h +47 -0
  16. data/ext/ruby_whisper_parakeet.c +49 -0
  17. data/ext/ruby_whisper_parakeet_context.c +304 -0
  18. data/ext/ruby_whisper_parakeet_context_params.c +117 -0
  19. data/ext/ruby_whisper_parakeet_model.c +84 -0
  20. data/ext/ruby_whisper_parakeet_params.c +548 -0
  21. data/ext/ruby_whisper_parakeet_segment.c +157 -0
  22. data/ext/ruby_whisper_parakeet_token.c +188 -0
  23. data/ext/ruby_whisper_parakeet_transcribe.cpp +58 -0
  24. data/ext/ruby_whisper_params.c +256 -65
  25. data/ext/ruby_whisper_segment.c +6 -6
  26. data/ext/ruby_whisper_transcribe.cpp +42 -15
  27. data/ext/sources/CMakeLists.txt +41 -3
  28. data/ext/sources/CMakePresets.json +95 -0
  29. data/ext/sources/cmake/parakeet-config.cmake.in +30 -0
  30. data/ext/sources/cmake/parakeet.pc.in +10 -0
  31. data/ext/sources/cmake/whisper.pc.in +1 -1
  32. data/ext/sources/examples/CMakeLists.txt +4 -2
  33. data/ext/sources/examples/bench/bench.cpp +1 -1
  34. data/ext/sources/examples/cli/cli.cpp +43 -9
  35. data/ext/sources/examples/common-ggml.cpp +2 -0
  36. data/ext/sources/examples/common-whisper.cpp +139 -67
  37. data/ext/sources/examples/common-whisper.h +11 -0
  38. data/ext/sources/examples/ffmpeg-transcode.cpp +211 -341
  39. data/ext/sources/examples/parakeet-cli/CMakeLists.txt +8 -0
  40. data/ext/sources/examples/parakeet-cli/parakeet-cli.cpp +243 -0
  41. data/ext/sources/examples/parakeet-quantize/CMakeLists.txt +7 -0
  42. data/ext/sources/examples/parakeet-quantize/parakeet-quantize.cpp +230 -0
  43. data/ext/sources/examples/server/server.cpp +199 -163
  44. data/ext/sources/ggml/CMakeLists.txt +21 -13
  45. data/ext/sources/ggml/cmake/FindNCCL.cmake +36 -0
  46. data/ext/sources/ggml/cmake/ggml-config.cmake.in +12 -2
  47. data/ext/sources/ggml/include/ggml-alloc.h +1 -0
  48. data/ext/sources/ggml/include/ggml-backend.h +72 -10
  49. data/ext/sources/ggml/include/ggml-cuda.h +3 -0
  50. data/ext/sources/ggml/include/ggml-rpc.h +3 -3
  51. data/ext/sources/ggml/include/ggml.h +101 -9
  52. data/ext/sources/ggml/include/gguf.h +10 -2
  53. data/ext/sources/ggml/src/CMakeLists.txt +22 -5
  54. data/ext/sources/ggml/src/ggml-alloc.c +5 -1
  55. data/ext/sources/ggml/src/ggml-backend-impl.h +22 -2
  56. data/ext/sources/ggml/src/ggml-backend-meta.cpp +2263 -0
  57. data/ext/sources/ggml/src/ggml-backend-reg.cpp +12 -0
  58. data/ext/sources/ggml/src/ggml-backend.cpp +110 -9
  59. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +4 -0
  60. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +672 -257
  61. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +71 -0
  62. data/ext/sources/ggml/src/ggml-cann/common.h +20 -10
  63. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +211 -30
  64. data/ext/sources/ggml/src/ggml-common.h +11 -0
  65. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +58 -29
  66. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +2 -0
  67. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +16 -16
  68. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +116 -7
  69. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +65 -0
  70. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +151 -1
  71. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +0 -1
  72. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +4279 -1292
  73. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +5 -35
  74. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +0 -1
  75. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +72 -1
  76. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +177 -27
  77. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +1 -1
  78. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +5 -0
  79. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  80. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -0
  81. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +95 -5
  82. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  83. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +146 -134
  84. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +88 -70
  85. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +372 -73
  86. data/ext/sources/ggml/src/ggml-cpu/ops.h +3 -0
  87. data/ext/sources/ggml/src/ggml-cpu/quants.c +55 -0
  88. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  89. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +3 -0
  90. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +90 -0
  91. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +3 -16
  92. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1402 -687
  93. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +8 -0
  94. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +597 -2766
  95. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  96. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  97. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  98. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +182 -19
  99. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  100. data/ext/sources/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  101. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  102. data/ext/sources/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  103. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  104. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  105. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  106. data/ext/sources/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  107. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +37 -53
  108. data/ext/sources/ggml/src/ggml-cpu/vec.h +225 -240
  109. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +17 -7
  110. data/ext/sources/ggml/src/ggml-cuda/allreduce.cu +971 -0
  111. data/ext/sources/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  112. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +62 -26
  113. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +44 -18
  114. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +1 -0
  115. data/ext/sources/ggml/src/ggml-cuda/common.cuh +242 -28
  116. data/ext/sources/ggml/src/ggml-cuda/concat.cu +120 -114
  117. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +45 -21
  118. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +1 -0
  119. data/ext/sources/ggml/src/ggml-cuda/convert.cu +53 -0
  120. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +10 -0
  121. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +14 -6
  122. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +22 -0
  123. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +278 -44
  124. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +331 -130
  125. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +12 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +126 -27
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +40 -15
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +18 -9
  129. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +152 -49
  130. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  131. data/ext/sources/ggml/src/ggml-cuda/fwht.cu +101 -0
  132. data/ext/sources/ggml/src/ggml-cuda/fwht.cuh +4 -0
  133. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +84 -35
  134. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +34 -12
  135. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1069 -609
  136. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +32 -29
  137. data/ext/sources/ggml/src/ggml-cuda/mean.cu +4 -2
  138. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +242 -195
  139. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +3 -3
  140. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +18 -12
  141. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +502 -423
  142. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +19 -12
  143. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +485 -57
  144. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +6 -1
  145. data/ext/sources/ggml/src/ggml-cuda/norm.cu +36 -10
  146. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +23 -7
  147. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +133 -26
  148. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +1 -1
  149. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +5 -1
  150. data/ext/sources/ggml/src/ggml-cuda/rope.cu +11 -4
  151. data/ext/sources/ggml/src/ggml-cuda/scale.cu +4 -1
  152. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +14 -6
  153. data/ext/sources/ggml/src/ggml-cuda/snake.cu +72 -0
  154. data/ext/sources/ggml/src/ggml-cuda/snake.cuh +8 -0
  155. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +4 -1
  156. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +45 -13
  157. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  158. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +40 -18
  159. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +8 -4
  160. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +1 -0
  161. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +1 -0
  162. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +2 -0
  163. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  164. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +1 -0
  165. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +1 -0
  166. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  167. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +2 -0
  168. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +1 -0
  169. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  170. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +2 -0
  171. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  172. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +2 -0
  173. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  174. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  176. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  177. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  178. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  179. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  191. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +5 -4
  192. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +26 -23
  193. data/ext/sources/ggml/src/ggml-cuda/unary.cu +31 -2
  194. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  195. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +80 -0
  196. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +7 -2
  197. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +22 -4
  198. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  199. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +2 -1
  200. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +1428 -743
  201. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -7
  202. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +53 -84
  203. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +25 -12
  204. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +165 -184
  205. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +5 -5
  206. data/ext/sources/ggml/src/ggml-hexagon/htp/concat-ops.c +277 -0
  207. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +170 -127
  208. data/ext/sources/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  209. data/ext/sources/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  210. data/ext/sources/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  211. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +125 -97
  212. data/ext/sources/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +1148 -0
  213. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +148 -42
  214. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.c +2 -2
  215. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dma.h +252 -62
  216. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +9 -0
  217. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +87 -1
  218. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1878 -0
  219. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +2066 -0
  220. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.c +6 -0
  221. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-ops.h +88 -0
  222. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  223. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  224. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  225. data/ext/sources/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  226. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +96 -13
  227. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +182 -57
  228. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +9 -3
  229. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +71 -3
  230. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +27 -10
  231. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +63 -23
  232. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +9 -8
  233. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-flash-attn.h +47 -0
  234. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-log.h +65 -0
  235. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-pow.h +42 -0
  236. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  237. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +1 -0
  238. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h +90 -0
  239. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +5 -8
  240. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +529 -815
  241. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2522 -234
  242. data/ext/sources/ggml/src/ggml-hexagon/htp/pad-ops.c +547 -0
  243. data/ext/sources/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  244. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +291 -95
  245. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +59 -37
  246. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +121 -133
  247. data/ext/sources/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  248. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +244 -151
  249. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +6 -6
  250. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +719 -45
  251. data/ext/sources/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  252. data/ext/sources/ggml/src/ggml-hexagon/htp-opnode.h +272 -0
  253. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +3 -1
  254. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +22 -9
  255. data/ext/sources/ggml/src/ggml-impl.h +6 -1
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +138 -13
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +32 -1
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +164 -28
  259. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +80 -0
  260. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +190 -19
  261. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +2 -0
  262. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +39 -26
  263. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +823 -322
  264. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +5 -6
  265. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +54 -5
  266. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +12248 -5907
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +67 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +59 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +1819 -112
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/gated_delta_net.cl +249 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +306 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +256 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +258 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl +283 -0
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +260 -0
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +262 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl +288 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl +267 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mat_Ab_Bi_8x4.cl → gemm_noshuffle_q4_0_f32.cl} +1 -1
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl +131 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl +134 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mm_q8_0_f32_8x4.cl → gemm_noshuffle_q8_0_f32.cl} +1 -1
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +165 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +120 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +123 -0
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl +155 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +123 -0
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +125 -0
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl +160 -0
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl +141 -0
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general.cl → gemv_noshuffle_q4_0_f32.cl} +5 -5
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle.cl → gemv_noshuffle_q4_0_f32_spec.cl} +5 -5
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl +291 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl +294 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +15 -9
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl +173 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl +175 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl +241 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl +243 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl +243 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl +247 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +48 -64
  322. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +15 -5
  323. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +18 -11
  324. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +35 -13
  325. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +264 -192
  326. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +33 -7
  327. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  328. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +1 -0
  329. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +1 -0
  330. data/ext/sources/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  331. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +27 -3
  332. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +67 -36
  333. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +1 -0
  334. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +101 -44
  335. data/ext/sources/ggml/src/ggml-openvino/utils.h +23 -3
  336. data/ext/sources/ggml/src/ggml-opt.cpp +1 -0
  337. data/ext/sources/ggml/src/ggml-quants.c +289 -114
  338. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  339. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +24 -0
  340. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +167 -311
  341. data/ext/sources/ggml/src/ggml-rpc/transport.cpp +683 -0
  342. data/ext/sources/ggml/src/ggml-rpc/transport.h +34 -0
  343. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +50 -4
  344. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +1 -1
  345. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +3 -1
  346. data/ext/sources/ggml/src/ggml-sycl/common.cpp +74 -2
  347. data/ext/sources/ggml/src/ggml-sycl/common.hpp +41 -1
  348. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +115 -13
  349. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +9 -0
  350. data/ext/sources/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  351. data/ext/sources/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  352. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +663 -0
  353. data/ext/sources/ggml/src/ggml-sycl/diag.cpp +67 -0
  354. data/ext/sources/ggml/src/ggml-sycl/diag.hpp +5 -0
  355. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +586 -6
  356. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1 -90
  357. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +0 -2
  358. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  359. data/ext/sources/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  360. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +7 -5
  361. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +4 -0
  362. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +76 -168
  363. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +7 -0
  364. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +3 -1
  365. data/ext/sources/ggml/src/ggml-sycl/fill.cpp +55 -0
  366. data/ext/sources/ggml/src/ggml-sycl/fill.hpp +5 -0
  367. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +69 -31
  368. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +1 -0
  369. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  370. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +79 -3
  371. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +823 -190
  372. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +353 -89
  373. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +5 -3
  374. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1344 -26
  375. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +16 -0
  376. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +27 -27
  377. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +71 -0
  378. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +7 -1
  379. data/ext/sources/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  380. data/ext/sources/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  381. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +6 -1
  382. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  383. data/ext/sources/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  384. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +62 -10
  385. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +18 -6
  386. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  387. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +1 -0
  388. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +1 -0
  389. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +1 -0
  390. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +1 -0
  391. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +1 -0
  392. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +1 -0
  393. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +1 -0
  394. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +1 -0
  395. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +1 -0
  396. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +1 -0
  397. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +1 -0
  398. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +1 -0
  399. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +1 -0
  400. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +1 -0
  401. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +1 -0
  402. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +1 -0
  403. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +1 -0
  404. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +1 -0
  405. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +1 -0
  406. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +1 -0
  407. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +1 -0
  408. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +1 -0
  409. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +1 -0
  410. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +1 -0
  411. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +1 -0
  412. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +1 -0
  413. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +1 -0
  414. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +1 -0
  415. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +1 -0
  416. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +1 -0
  417. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +1 -0
  418. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +1 -0
  420. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +1 -0
  421. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +1 -0
  422. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +1 -0
  423. data/ext/sources/ggml/src/ggml-sycl/type.hpp +112 -0
  424. data/ext/sources/ggml/src/ggml-sycl/upscale.cpp +410 -0
  425. data/ext/sources/ggml/src/ggml-sycl/upscale.hpp +9 -0
  426. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +215 -53
  427. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +4 -0
  428. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +2 -0
  429. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +2 -0
  430. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +1 -0
  431. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +1 -0
  432. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +0 -2
  433. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +11 -0
  434. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +2060 -535
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +6 -2
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +146 -13
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +3 -1
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +1 -1
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +25 -1
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +88 -0
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +643 -1
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +0 -1
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dot_product_funcs.glsl +27 -0
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +0 -1
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp +7 -0
  449. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +197 -48
  450. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +60 -59
  451. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +115 -113
  452. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +122 -31
  453. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +131 -0
  454. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  455. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +115 -0
  456. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +125 -64
  457. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +0 -1
  458. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +10 -1
  459. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +16 -6
  460. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +76 -54
  461. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +0 -1
  462. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +0 -1
  463. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +122 -27
  464. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +6 -6
  465. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +1 -1
  466. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +1 -1
  467. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +1 -1
  468. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +1 -0
  469. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +88 -55
  470. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -17
  471. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +43 -10
  472. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +159 -125
  473. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +8 -8
  474. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +24 -9
  475. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +0 -1
  476. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +5 -2
  477. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +0 -1
  478. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +3 -2
  479. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp +49 -0
  480. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +11 -1
  481. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +0 -1
  482. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +79 -2
  483. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +171 -147
  484. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +5 -2
  485. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +2202 -283
  486. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +2610 -1403
  487. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +37 -7
  488. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  489. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +8 -7
  490. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +76 -95
  491. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +19 -1
  492. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  493. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{cpy.tmpl.wgsl → cpy.wgsl} +25 -50
  494. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +107 -184
  495. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_quant_staging.tmpl +124 -0
  496. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +397 -0
  497. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  498. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  499. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +619 -0
  500. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +149 -0
  501. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +183 -78
  502. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  503. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  504. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +655 -495
  505. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  506. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +52 -0
  507. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  508. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +8 -6
  509. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +5 -1
  510. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +80 -409
  511. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1432 -0
  512. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +303 -0
  513. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quant_inner_loops.tmpl +21 -0
  514. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +173 -0
  515. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  516. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{rope.tmpl.wgsl → rope.wgsl} +71 -142
  517. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  518. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +6 -4
  519. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  520. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +2 -3
  521. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl +224 -0
  522. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{soft_max.tmpl.wgsl → soft_max.wgsl} +106 -206
  523. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  524. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  525. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  526. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +68 -48
  527. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  528. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +18 -14
  529. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +1 -1
  530. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +244 -10
  531. data/ext/sources/ggml/src/ggml.c +110 -28
  532. data/ext/sources/ggml/src/gguf.cpp +173 -28
  533. data/ext/sources/include/parakeet.h +342 -0
  534. data/ext/sources/include/whisper.h +10 -0
  535. data/ext/sources/media/matmul.png +0 -0
  536. data/ext/sources/src/CMakeLists.txt +23 -0
  537. data/ext/sources/src/parakeet-arch.h +188 -0
  538. data/ext/sources/src/parakeet.cpp +3838 -0
  539. data/ext/sources/src/whisper.cpp +56 -12
  540. data/extsources.rb +26 -10
  541. data/lib/whisper/log_settable.rb +36 -0
  542. data/lib/whisper/model/uri.rb +13 -1
  543. data/lib/whisper/output.rb +74 -0
  544. data/sig/whisper.rbs +411 -62
  545. data/test/helper.rb +2 -0
  546. data/test/jfk_reader/jfk_reader.c +50 -7
  547. data/test/test_callback.rb +1 -0
  548. data/test/test_package.rb +6 -5
  549. data/test/test_parakeet.rb +28 -0
  550. data/test/test_parakeet_callback.rb +107 -0
  551. data/test/test_parakeet_context.rb +116 -0
  552. data/test/test_parakeet_context_params.rb +24 -0
  553. data/test/test_parakeet_model.rb +21 -0
  554. data/test/test_parakeet_params.rb +78 -0
  555. data/test/test_parakeet_segment.rb +42 -0
  556. data/test/test_parakeet_token.rb +73 -0
  557. data/test/test_params.rb +2 -0
  558. data/test/test_vad_segment.rb +1 -1
  559. data/test/test_whisper.rb +24 -6
  560. data/whispercpp.gemspec +2 -2
  561. metadata +215 -281
  562. data/ext/sources/bindings/javascript/CMakeLists.txt +0 -41
  563. data/ext/sources/bindings/javascript/emscripten.cpp +0 -93
  564. data/ext/sources/bindings/javascript/libwhisper.worker.js +0 -1
  565. data/ext/sources/bindings/javascript/package.json +0 -26
  566. data/ext/sources/bindings/javascript/whisper.js +0 -19
  567. data/ext/sources/examples/addon.node/CMakeLists.txt +0 -31
  568. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +0 -133
  569. data/ext/sources/examples/addon.node/addon.cpp +0 -557
  570. data/ext/sources/examples/addon.node/index.js +0 -59
  571. data/ext/sources/examples/addon.node/package.json +0 -16
  572. data/ext/sources/examples/addon.node/vad-example.js +0 -132
  573. data/ext/sources/examples/bench.wasm/CMakeLists.txt +0 -49
  574. data/ext/sources/examples/bench.wasm/emscripten.cpp +0 -87
  575. data/ext/sources/examples/bench.wasm/index-tmpl.html +0 -285
  576. data/ext/sources/examples/coi-serviceworker.js +0 -146
  577. data/ext/sources/examples/command/CMakeLists.txt +0 -10
  578. data/ext/sources/examples/command/command.cpp +0 -802
  579. data/ext/sources/examples/command/commands.txt +0 -9
  580. data/ext/sources/examples/command.wasm/CMakeLists.txt +0 -50
  581. data/ext/sources/examples/command.wasm/emscripten.cpp +0 -327
  582. data/ext/sources/examples/command.wasm/index-tmpl.html +0 -415
  583. data/ext/sources/examples/generate-karaoke.sh +0 -57
  584. data/ext/sources/examples/helpers.js +0 -191
  585. data/ext/sources/examples/livestream.sh +0 -112
  586. data/ext/sources/examples/lsp/CMakeLists.txt +0 -10
  587. data/ext/sources/examples/lsp/lsp.cpp +0 -471
  588. data/ext/sources/examples/lsp/whisper.vim +0 -362
  589. data/ext/sources/examples/python/test_whisper_processor.py +0 -7
  590. data/ext/sources/examples/python/whisper_processor.py +0 -54
  591. data/ext/sources/examples/server/bench.js +0 -29
  592. data/ext/sources/examples/server.py +0 -120
  593. data/ext/sources/examples/stream/CMakeLists.txt +0 -10
  594. data/ext/sources/examples/stream/stream.cpp +0 -437
  595. data/ext/sources/examples/stream.wasm/CMakeLists.txt +0 -49
  596. data/ext/sources/examples/stream.wasm/emscripten.cpp +0 -216
  597. data/ext/sources/examples/stream.wasm/index-tmpl.html +0 -491
  598. data/ext/sources/examples/sycl/CMakeLists.txt +0 -9
  599. data/ext/sources/examples/sycl/build.sh +0 -22
  600. data/ext/sources/examples/sycl/ls-sycl-device.cpp +0 -11
  601. data/ext/sources/examples/sycl/run-whisper.sh +0 -17
  602. data/ext/sources/examples/talk-llama/CMakeLists.txt +0 -48
  603. data/ext/sources/examples/talk-llama/eleven-labs.py +0 -80
  604. data/ext/sources/examples/talk-llama/llama-adapter.cpp +0 -488
  605. data/ext/sources/examples/talk-llama/llama-adapter.h +0 -89
  606. data/ext/sources/examples/talk-llama/llama-arch.cpp +0 -2877
  607. data/ext/sources/examples/talk-llama/llama-arch.h +0 -628
  608. data/ext/sources/examples/talk-llama/llama-batch.cpp +0 -919
  609. data/ext/sources/examples/talk-llama/llama-batch.h +0 -173
  610. data/ext/sources/examples/talk-llama/llama-chat.cpp +0 -896
  611. data/ext/sources/examples/talk-llama/llama-chat.h +0 -71
  612. data/ext/sources/examples/talk-llama/llama-context.cpp +0 -3633
  613. data/ext/sources/examples/talk-llama/llama-context.h +0 -359
  614. data/ext/sources/examples/talk-llama/llama-cparams.cpp +0 -5
  615. data/ext/sources/examples/talk-llama/llama-cparams.h +0 -47
  616. data/ext/sources/examples/talk-llama/llama-ext.h +0 -12
  617. data/ext/sources/examples/talk-llama/llama-grammar.cpp +0 -1464
  618. data/ext/sources/examples/talk-llama/llama-grammar.h +0 -194
  619. data/ext/sources/examples/talk-llama/llama-graph.cpp +0 -2735
  620. data/ext/sources/examples/talk-llama/llama-graph.h +0 -1031
  621. data/ext/sources/examples/talk-llama/llama-hparams.cpp +0 -258
  622. data/ext/sources/examples/talk-llama/llama-hparams.h +0 -353
  623. data/ext/sources/examples/talk-llama/llama-impl.cpp +0 -171
  624. data/ext/sources/examples/talk-llama/llama-impl.h +0 -75
  625. data/ext/sources/examples/talk-llama/llama-io.cpp +0 -15
  626. data/ext/sources/examples/talk-llama/llama-io.h +0 -35
  627. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +0 -330
  628. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +0 -137
  629. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2285
  630. data/ext/sources/examples/talk-llama/llama-kv-cache.h +0 -389
  631. data/ext/sources/examples/talk-llama/llama-kv-cells.h +0 -533
  632. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +0 -275
  633. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +0 -140
  634. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +0 -268
  635. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +0 -139
  636. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +0 -1165
  637. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +0 -182
  638. data/ext/sources/examples/talk-llama/llama-memory.cpp +0 -59
  639. data/ext/sources/examples/talk-llama/llama-memory.h +0 -122
  640. data/ext/sources/examples/talk-llama/llama-mmap.cpp +0 -752
  641. data/ext/sources/examples/talk-llama/llama-mmap.h +0 -73
  642. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +0 -1655
  643. data/ext/sources/examples/talk-llama/llama-model-loader.h +0 -206
  644. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +0 -299
  645. data/ext/sources/examples/talk-llama/llama-model-saver.h +0 -40
  646. data/ext/sources/examples/talk-llama/llama-model.cpp +0 -9056
  647. data/ext/sources/examples/talk-llama/llama-model.h +0 -597
  648. data/ext/sources/examples/talk-llama/llama-quant.cpp +0 -1304
  649. data/ext/sources/examples/talk-llama/llama-quant.h +0 -1
  650. data/ext/sources/examples/talk-llama/llama-sampler.cpp +0 -3885
  651. data/ext/sources/examples/talk-llama/llama-sampler.h +0 -42
  652. data/ext/sources/examples/talk-llama/llama-vocab.cpp +0 -3970
  653. data/ext/sources/examples/talk-llama/llama-vocab.h +0 -187
  654. data/ext/sources/examples/talk-llama/llama.cpp +0 -1194
  655. data/ext/sources/examples/talk-llama/llama.h +0 -1573
  656. data/ext/sources/examples/talk-llama/models/afmoe.cpp +0 -190
  657. data/ext/sources/examples/talk-llama/models/apertus.cpp +0 -125
  658. data/ext/sources/examples/talk-llama/models/arcee.cpp +0 -135
  659. data/ext/sources/examples/talk-llama/models/arctic.cpp +0 -137
  660. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +0 -86
  661. data/ext/sources/examples/talk-llama/models/baichuan.cpp +0 -123
  662. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +0 -143
  663. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +0 -133
  664. data/ext/sources/examples/talk-llama/models/bert.cpp +0 -184
  665. data/ext/sources/examples/talk-llama/models/bitnet.cpp +0 -145
  666. data/ext/sources/examples/talk-llama/models/bloom.cpp +0 -101
  667. data/ext/sources/examples/talk-llama/models/chameleon.cpp +0 -178
  668. data/ext/sources/examples/talk-llama/models/chatglm.cpp +0 -132
  669. data/ext/sources/examples/talk-llama/models/codeshell.cpp +0 -111
  670. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +0 -102
  671. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +0 -134
  672. data/ext/sources/examples/talk-llama/models/command-r.cpp +0 -122
  673. data/ext/sources/examples/talk-llama/models/dbrx.cpp +0 -122
  674. data/ext/sources/examples/talk-llama/models/deci.cpp +0 -135
  675. data/ext/sources/examples/talk-llama/models/deepseek.cpp +0 -142
  676. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +0 -262
  677. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +0 -445
  678. data/ext/sources/examples/talk-llama/models/dots1.cpp +0 -132
  679. data/ext/sources/examples/talk-llama/models/dream.cpp +0 -105
  680. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +0 -148
  681. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +0 -110
  682. data/ext/sources/examples/talk-llama/models/eurobert.cpp +0 -97
  683. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +0 -145
  684. data/ext/sources/examples/talk-llama/models/exaone.cpp +0 -114
  685. data/ext/sources/examples/talk-llama/models/exaone4.cpp +0 -123
  686. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +0 -111
  687. data/ext/sources/examples/talk-llama/models/falcon.cpp +0 -120
  688. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +0 -116
  689. data/ext/sources/examples/talk-llama/models/gemma.cpp +0 -112
  690. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +0 -128
  691. data/ext/sources/examples/talk-llama/models/gemma3.cpp +0 -155
  692. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +0 -384
  693. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +0 -170
  694. data/ext/sources/examples/talk-llama/models/glm4.cpp +0 -157
  695. data/ext/sources/examples/talk-llama/models/gpt2.cpp +0 -105
  696. data/ext/sources/examples/talk-llama/models/gptneox.cpp +0 -144
  697. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +0 -195
  698. data/ext/sources/examples/talk-llama/models/granite.cpp +0 -210
  699. data/ext/sources/examples/talk-llama/models/grok.cpp +0 -159
  700. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +0 -139
  701. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +0 -132
  702. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +0 -153
  703. data/ext/sources/examples/talk-llama/models/internlm2.cpp +0 -120
  704. data/ext/sources/examples/talk-llama/models/jais.cpp +0 -86
  705. data/ext/sources/examples/talk-llama/models/jais2.cpp +0 -123
  706. data/ext/sources/examples/talk-llama/models/jamba.cpp +0 -106
  707. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +0 -381
  708. data/ext/sources/examples/talk-llama/models/lfm2.cpp +0 -196
  709. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +0 -122
  710. data/ext/sources/examples/talk-llama/models/llada.cpp +0 -99
  711. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +0 -178
  712. data/ext/sources/examples/talk-llama/models/llama.cpp +0 -175
  713. data/ext/sources/examples/talk-llama/models/maincoder.cpp +0 -117
  714. data/ext/sources/examples/talk-llama/models/mamba-base.cpp +0 -289
  715. data/ext/sources/examples/talk-llama/models/mamba.cpp +0 -54
  716. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +0 -129
  717. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +0 -200
  718. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +0 -123
  719. data/ext/sources/examples/talk-llama/models/mistral3.cpp +0 -160
  720. data/ext/sources/examples/talk-llama/models/models.h +0 -704
  721. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +0 -109
  722. data/ext/sources/examples/talk-llama/models/mpt.cpp +0 -126
  723. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +0 -162
  724. data/ext/sources/examples/talk-llama/models/nemotron.cpp +0 -122
  725. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +0 -104
  726. data/ext/sources/examples/talk-llama/models/olmo.cpp +0 -121
  727. data/ext/sources/examples/talk-llama/models/olmo2.cpp +0 -150
  728. data/ext/sources/examples/talk-llama/models/olmoe.cpp +0 -124
  729. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +0 -127
  730. data/ext/sources/examples/talk-llama/models/openelm.cpp +0 -124
  731. data/ext/sources/examples/talk-llama/models/orion.cpp +0 -123
  732. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +0 -122
  733. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +0 -121
  734. data/ext/sources/examples/talk-llama/models/phi2.cpp +0 -121
  735. data/ext/sources/examples/talk-llama/models/phi3.cpp +0 -152
  736. data/ext/sources/examples/talk-llama/models/plamo.cpp +0 -110
  737. data/ext/sources/examples/talk-llama/models/plamo2.cpp +0 -320
  738. data/ext/sources/examples/talk-llama/models/plamo3.cpp +0 -128
  739. data/ext/sources/examples/talk-llama/models/plm.cpp +0 -169
  740. data/ext/sources/examples/talk-llama/models/qwen.cpp +0 -108
  741. data/ext/sources/examples/talk-llama/models/qwen2.cpp +0 -126
  742. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +0 -151
  743. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +0 -117
  744. data/ext/sources/examples/talk-llama/models/qwen3.cpp +0 -120
  745. data/ext/sources/examples/talk-llama/models/qwen35.cpp +0 -381
  746. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +0 -422
  747. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +0 -131
  748. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +0 -525
  749. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +0 -140
  750. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +0 -132
  751. data/ext/sources/examples/talk-llama/models/refact.cpp +0 -94
  752. data/ext/sources/examples/talk-llama/models/rnd1.cpp +0 -126
  753. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +0 -164
  754. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +0 -94
  755. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +0 -86
  756. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +0 -137
  757. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +0 -90
  758. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +0 -124
  759. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +0 -126
  760. data/ext/sources/examples/talk-llama/models/smollm3.cpp +0 -128
  761. data/ext/sources/examples/talk-llama/models/stablelm.cpp +0 -146
  762. data/ext/sources/examples/talk-llama/models/starcoder.cpp +0 -100
  763. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +0 -121
  764. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +0 -165
  765. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +0 -166
  766. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +0 -96
  767. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +0 -149
  768. data/ext/sources/examples/talk-llama/models/xverse.cpp +0 -108
  769. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +0 -23
  770. data/ext/sources/examples/talk-llama/speak +0 -40
  771. data/ext/sources/examples/talk-llama/speak.bat +0 -1
  772. data/ext/sources/examples/talk-llama/speak.ps1 +0 -14
  773. data/ext/sources/examples/talk-llama/talk-llama.cpp +0 -813
  774. data/ext/sources/examples/talk-llama/unicode-data.cpp +0 -7034
  775. data/ext/sources/examples/talk-llama/unicode-data.h +0 -20
  776. data/ext/sources/examples/talk-llama/unicode.cpp +0 -1103
  777. data/ext/sources/examples/talk-llama/unicode.h +0 -111
  778. data/ext/sources/examples/wchess/CMakeLists.txt +0 -10
  779. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +0 -19
  780. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +0 -803
  781. data/ext/sources/examples/wchess/libwchess/Chessboard.h +0 -33
  782. data/ext/sources/examples/wchess/libwchess/WChess.cpp +0 -193
  783. data/ext/sources/examples/wchess/libwchess/WChess.h +0 -63
  784. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +0 -117
  785. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +0 -8
  786. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -253
  787. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +0 -50
  788. data/ext/sources/examples/whisper.wasm/emscripten.cpp +0 -118
  789. data/ext/sources/examples/whisper.wasm/index-tmpl.html +0 -659
  790. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +0 -99
  791. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +0 -155
  792. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +0 -153
  793. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +0 -26
  794. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +0 -123
  795. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +0 -17
  796. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +0 -333
  797. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +0 -5
  798. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +0 -182
  799. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +0 -323
  800. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -718
  801. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +0 -123
  802. data/ext/sources/tests/CMakeLists.txt +0 -112
  803. data/ext/sources/tests/earnings21/eval.mk +0 -58
  804. data/ext/sources/tests/earnings21/eval.py +0 -68
  805. data/ext/sources/tests/earnings21/normalizers/__init__.py +0 -2
  806. data/ext/sources/tests/earnings21/normalizers/basic.py +0 -80
  807. data/ext/sources/tests/earnings21/normalizers/english.json +0 -1741
  808. data/ext/sources/tests/earnings21/normalizers/english.py +0 -550
  809. data/ext/sources/tests/earnings21/requirements.txt +0 -6
  810. data/ext/sources/tests/en-0-ref.txt +0 -1
  811. data/ext/sources/tests/en-1-ref.txt +0 -1
  812. data/ext/sources/tests/en-2-ref.txt +0 -1
  813. data/ext/sources/tests/es-0-ref.txt +0 -1
  814. data/ext/sources/tests/librispeech/eval.mk +0 -39
  815. data/ext/sources/tests/librispeech/eval.py +0 -47
  816. data/ext/sources/tests/librispeech/normalizers/__init__.py +0 -2
  817. data/ext/sources/tests/librispeech/normalizers/basic.py +0 -80
  818. data/ext/sources/tests/librispeech/normalizers/english.json +0 -1741
  819. data/ext/sources/tests/librispeech/normalizers/english.py +0 -550
  820. data/ext/sources/tests/librispeech/requirements.txt +0 -6
  821. data/ext/sources/tests/run-tests.sh +0 -130
  822. data/ext/sources/tests/test-c.c +0 -3
  823. data/ext/sources/tests/test-vad-full.cpp +0 -56
  824. data/ext/sources/tests/test-vad.cpp +0 -83
  825. data/ext/sources/tests/test-whisper.js +0 -58
  826. data/lib/whisper/context.rb +0 -15
  827. data/lib/whisper/segment.rb +0 -58
  828. /data/ext/sources/ggml/src/ggml-opencl/kernels/{gemv_noshuffle_general_q8_0_f32.cl → gemv_noshuffle_q8_0_f32.cl} +0 -0
@@ -28,6 +28,7 @@
28
28
  #define QK8_0 32
29
29
  #define QR8_0 1
30
30
  #define QK_K 256
31
+ #define K_SCALE_SIZE (3 * QK_K / 64) // 12 when QK_K is 256; length of the s[] arrays in block_q4_K and block_q5_K
31
32
  #define K_QUANTS_PER_ITERATION 2
32
33
 
33
34
  typedef char int8_t;
@@ -55,6 +56,46 @@ struct block_q4_1 {
55
56
  uchar qs[QK4_1 / 2]; // nibbles / quants
56
57
  };
57
58
 
59
+ //------------------------------------------------------------------------------
60
+ // block_q5_0
61
+ //------------------------------------------------------------------------------
62
+ struct block_q5_0 { // one Q5_0 block: a half scale, packed fifth bits, then packed 4-bit nibbles
63
+ half d; // delta
64
+ uchar qh[4]; // 5-th bit of quants (4 bytes = 32 bit slots)
65
+ uchar qs[QK5_0 / 2]; // nibbles / quants; QK5_0 is defined outside this hunk
66
+ };
67
+
68
+ //------------------------------------------------------------------------------
69
+ // block_q5_1
70
+ //------------------------------------------------------------------------------
71
+ struct block_q5_1 { // one Q5_1 block: same fields as block_q5_0 plus a half minimum
71
+ half d; // delta
72
+ half m; // min
73
+ uchar qh[4]; // 5-th bit of quants (4 bytes = 32 bit slots)
74
+ uchar qs[QK5_1 / 2]; // nibbles / quants; QK5_1 is defined outside this hunk
75
+ };
77
+
78
+ //------------------------------------------------------------------------------
79
+ // block_q4_K
80
+ //------------------------------------------------------------------------------
81
+ struct block_q4_K { // one Q4_K super-block covering QK_K (256) quants
81
+ half d; // delta
82
+ half dm; // min
83
+ uchar s[K_SCALE_SIZE]; // K_SCALE_SIZE = 3 * QK_K / 64 bytes; presumably packed sub-block scales/mins -- TODO confirm
84
+ uchar q[QK_K / 2]; // nibbles / quants (QK_K / 2 bytes, i.e. two 4-bit values per byte)
85
+ };
87
+
88
+ //------------------------------------------------------------------------------
89
+ // block_q5_K
90
+ //------------------------------------------------------------------------------
91
+ struct block_q5_K { // one Q5_K super-block covering QK_K (256) quants
91
+ half d; // delta
92
+ half dm; // min
93
+ uchar s[K_SCALE_SIZE]; // K_SCALE_SIZE = 3 * QK_K / 64 bytes; presumably packed sub-block scales/mins -- TODO confirm
94
+ uchar qh[QK_K / 8]; // QK_K / 8 bytes = one bit slot per quant; presumably the 5th bit of each quant -- TODO confirm
95
+ uchar qs[QK_K / 2]; // nibbles / quants (QK_K / 2 bytes, i.e. two 4-bit values per byte)
96
+ };
98
+
58
99
  //------------------------------------------------------------------------------
59
100
  // block_q6_K
60
101
  //------------------------------------------------------------------------------
@@ -65,6 +106,59 @@ struct block_q6_K {
65
106
  half d; // super-block scale
66
107
  };
67
108
 
109
+ //------------------------------------------------------------------------------
110
+ // block_iq4_nl
111
+ //------------------------------------------------------------------------------
112
+ #define QK4_NL 32 // number of quants per block_iq4_nl; sizes qs[] below
113
+
114
+ struct block_iq4_nl // one IQ4_NL block: a half value followed by QK4_NL / 2 packed bytes
115
+ {
116
+ half d; // NOTE(review): presumably the per-block scale, as in the other block structs -- confirm
117
+ uint8_t qs[QK4_NL / 2]; // 16 bytes for 32 quants, i.e. 4 bits per quant; uses uint8_t where the sibling structs use uchar
118
+ };
119
+
120
+ //------------------------------------------------------------------------------
121
+ // bf16 to f16
122
+ //------------------------------------------------------------------------------
123
+ kernel void kernel_convert_bf16_to_f16( // converts one 16-bit bf16 pattern per work-item into a half
124
+ global const ushort * src, // input bit patterns, read at src[i] (no source offset is applied)
125
+ global half * dst, // output buffer, written at dst[i + off_dst]
126
+ ulong off_dst, // offset added to the destination index, counted in half elements
127
+ ulong n // element count; work-items with id >= n do nothing
128
+ ) {
129
+ uint i = get_global_id(0); // NOTE(review): id is narrowed to 32 bits while n is ulong -- confirm launches never exceed 2^32 items
130
+ if (i >= n) {
131
+ return;
132
+ }
133
+
134
+ dst[i + off_dst] = (half) as_float((uint) src[i] << 16); // move the 16 bits into the top half of a 32-bit word, reinterpret as float, then convert to half
135
+ }
136
+
137
+ //------------------------------------------------------------------------------
138
+ // f16 to bf16
139
+ //------------------------------------------------------------------------------
140
+ kernel void kernel_convert_f16_to_bf16( // converts one half per work-item into a 16-bit bf16 pattern
141
+ global const half * src, // input buffer, read at src[i + off_src]
142
+ ulong off_src, // offset added to the source index, counted in half elements
143
+ global ushort * dst, // output bit patterns, written at dst[i] (no destination offset is applied)
144
+ ulong n // element count; work-items with id >= n do nothing
145
+ ) {
146
+ uint i = get_global_id(0); // NOTE(review): id is narrowed to 32 bits while n is ulong -- confirm launches never exceed 2^32 items
147
+ if (i >= n) {
148
+ return;
149
+ }
150
+
151
+ float f = (float) src[i + off_src]; // widen to float first; the result is the upper 16 bits of this float
152
+ uint bits = as_uint(f);
153
+ if ((bits & 0x7fffffffu) > 0x7f800000u) { // sign-masked bits above the infinity pattern: all-ones exponent with a nonzero mantissa, i.e. NaN
154
+ // nan to quiet nan: keep the upper 16 bits and set 0x40 there (bit 22 of the float, the top mantissa bit)
155
+ dst[i] = (ushort)((bits >> 16) | 0x40u);
156
+ } else {
157
+ uint rounded = bits + 0x7fffu + ((bits >> 16) & 1u); // round to nearest, ties to even: bias by 0x7fff plus the lowest kept bit
158
+ dst[i] = (ushort)(rounded >> 16); // keep the upper 16 bits after rounding
159
+ }
160
+ }
161
+
68
162
  //------------------------------------------------------------------------------
69
163
  // kernel_convert_block_q4_0
70
164
  // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
@@ -157,6 +251,100 @@ kernel void kernel_restore_block_q4_0_noshuffle(
157
251
  }
158
252
  }
159
253
 
254
+ kernel void kernel_convert_block_q4_0_trans4_ns(
255
+ global struct block_q4_0 * src0,
256
+ __global uint * dst_q,
257
+ __global half * dst_d,
258
+ uint ne00,
259
+ uint ne01
260
+ ) {
261
+ uint i00 = get_global_id(1);
262
+ uint i01 = get_global_id(0);
263
+ uint i02 = get_global_id(2);
264
+
265
+ if (i01 >= ne01) {
266
+ return;
267
+ }
268
+
269
+ uint ne00_blk = ne00 / QK4_0;
270
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
271
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
272
+
273
+ global struct block_q4_0 * b = src0 + src_blk_offset;
274
+ dst_d[dst_blk_offset] = b->d;
275
+
276
+ // extract quantization and unshuffle
277
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
278
+
279
+ ushort8 post_block = (ushort8)(0);
280
+
281
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
282
+ uchar * post_block_ptr = (uchar *)(&post_block);
283
+
284
+ for (int i = 0; i < QK4_0 / 4; ++i) {
285
+ uchar x0 = pre_block_ptr[2*i + 0];
286
+ uchar x1 = pre_block_ptr[2*i + 1];
287
+
288
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
289
+ post_block_ptr[i + QK4_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
290
+ }
291
+
292
+ uint4 q_block = as_uint4(post_block);
293
+
294
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
295
+ dst_q[offset] = q_block.x;
296
+ dst_q[offset + ne01] = q_block.y;
297
+ dst_q[offset + ne01 * 2] = q_block.z;
298
+ dst_q[offset + ne01 * 3] = q_block.w;
299
+ }
300
+
301
+ kernel void kernel_restore_block_q4_0_trans4_ns(
302
+ __global uint * src_q,
303
+ __global half * src_d,
304
+ __global struct block_q4_0 * dst0,
305
+ uint ne00,
306
+ uint ne01
307
+ ) {
308
+ uint i00 = get_global_id(1);
309
+ uint i01 = get_global_id(0);
310
+ uint i02 = get_global_id(2);
311
+
312
+ if (i01 >= ne01) {
313
+ return;
314
+ }
315
+
316
+ uint ne00_blk = ne00 / QK4_0;
317
+ uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
318
+ uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
319
+
320
+ __global struct block_q4_0 * b = dst0 + dst_blk_offset;
321
+ b->d = src_d[src_d_offset];
322
+
323
+ // collect transposed quantization parts for a block
324
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
325
+ uint4 q_block;
326
+ q_block.x = src_q[src_q_offset];
327
+ q_block.y = src_q[src_q_offset + ne01];
328
+ q_block.z = src_q[src_q_offset + ne01 * 2];
329
+ q_block.w = src_q[src_q_offset + ne01 * 3];
330
+
331
+ ushort8 post_block = as_ushort8(q_block);
332
+ ushort8 pre_block = (ushort8)(0);
333
+
334
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
335
+ uchar * post_block_ptr = (uchar *)(&post_block);
336
+
337
+ for (int i = 0; i < QK4_0 / 4; ++i) {
338
+ uchar x0 = post_block_ptr[i + 0];
339
+ uchar x1 = post_block_ptr[i + QK4_0 / 4];
340
+
341
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
342
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
343
+ }
344
+
345
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
346
+ }
347
+
160
348
  //------------------------------------------------------------------------------
161
349
  // kernel_convert_block_q4_1
162
350
  // Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
@@ -251,76 +439,60 @@ kernel void kernel_restore_block_q4_1_noshuffle(
251
439
  }
252
440
  }
253
441
 
254
- //------------------------------------------------------------------------------
255
- // block_mxfp4
256
- //------------------------------------------------------------------------------
257
- #define QK_MXFP4 32
258
- struct block_mxfp4 {
259
- uchar e; // E8M0
260
- uchar qs[QK_MXFP4 / 2];
261
- };
262
-
263
- //------------------------------------------------------------------------------
264
- // kernel_convert_block_mxfp4
265
- // Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
266
- // This kernel does not deshuffle the bits.
267
- //------------------------------------------------------------------------------
268
- kernel void kernel_convert_block_mxfp4(
269
- global struct block_mxfp4 * src0,
270
- global uchar * dst_q,
271
- global uchar * dst_e
272
- ) {
273
- global struct block_mxfp4 * b = (global struct block_mxfp4 *) src0 + get_global_id(0);
274
- global uchar * q = (global uchar *) dst_q + QK_MXFP4 / 2 * get_global_id(0);
275
- global uchar * e = (global uchar *) dst_e + get_global_id(0);
276
-
277
- *e = b->e;
278
-
279
- for (int i = 0; i < QK_MXFP4 / 2; ++i) {
280
- q[i] = b->qs[i];
281
- }
282
- }
283
-
284
- kernel void kernel_convert_block_mxfp4_trans(
285
- global struct block_mxfp4 * src0,
286
- __global uint4 * dst_q,
287
- __global uchar * dst_e,
442
+ kernel void kernel_convert_block_q4_1_trans4_ns(
443
+ __global struct block_q4_1 * src0,
444
+ __global uint * dst_q,
445
+ __global half * dst_d,
446
+ __global half * dst_m,
288
447
  uint ne00,
289
448
  uint ne01
290
449
  ) {
291
- int i00 = get_global_id(1);
450
+ uint i00 = get_global_id(1);
292
451
  uint i01 = get_global_id(0);
293
452
  uint i02 = get_global_id(2);
294
453
 
295
- uint ne00_blk = ne00 / QK_MXFP4;
454
+ if (i01 >= ne01) {
455
+ return;
456
+ }
457
+
458
+ uint ne00_blk = ne00 / QK4_1;
296
459
  uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
297
460
  uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
298
461
 
299
- global struct block_mxfp4 * b = src0 + src_blk_offset;
462
+ global struct block_q4_1 * b = src0 + src_blk_offset;
463
+ dst_d[dst_blk_offset] = b->d;
464
+ dst_m[dst_blk_offset] = b->m;
300
465
 
301
- dst_q[dst_blk_offset] = ((global uint4 *)(&(b->qs[0])))[0];
302
- dst_e[dst_blk_offset] = b->e;
303
- }
466
+ // extract quantization and unshuffle
467
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
304
468
 
305
- kernel void kernel_restore_block_mxfp4(
306
- global uchar * src_q,
307
- global half * src_e,
308
- global struct block_mxfp4 * dst
309
- ) {
310
- global struct block_mxfp4 * b = (global struct block_mxfp4 *) dst + get_global_id(0);
311
- global uchar * q = (global uchar *) src_q + QK_MXFP4 / 2 * get_global_id(0);
312
- global uchar * e = (global uchar *) src_e + get_global_id(0);
469
+ ushort8 post_block = (ushort8)(0);
313
470
 
314
- b->e = *e;
315
- for (int i = 0; i < QK_MXFP4 / 2; ++i) {
316
- b->qs[i] = q[i];
471
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
472
+ uchar * post_block_ptr = (uchar *)(&post_block);
473
+
474
+ for (int i = 0; i < QK4_1 / 4; ++i) {
475
+ uchar x0 = pre_block_ptr[2*i + 0];
476
+ uchar x1 = pre_block_ptr[2*i + 1];
477
+
478
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
479
+ post_block_ptr[i + QK4_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
317
480
  }
481
+
482
+ uint4 q_block = as_uint4(post_block);
483
+
484
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
485
+ dst_q[offset] = q_block.x;
486
+ dst_q[offset + ne01] = q_block.y;
487
+ dst_q[offset + ne01 * 2] = q_block.z;
488
+ dst_q[offset + ne01 * 3] = q_block.w;
318
489
  }
319
490
 
320
- kernel void kernel_restore_block_mxfp4_trans(
321
- __global uint4 * src_q,
322
- __global uchar * src_e,
323
- global struct block_mxfp4 * dst,
491
+ kernel void kernel_restore_block_q4_1_trans4_ns(
492
+ __global uint * src_q,
493
+ __global half * src_d,
494
+ __global half * src_m,
495
+ __global struct block_q4_1 * dst0,
324
496
  uint ne00,
325
497
  uint ne01
326
498
  ) {
@@ -328,99 +500,1419 @@ kernel void kernel_restore_block_mxfp4_trans(
328
500
  uint i01 = get_global_id(0);
329
501
  uint i02 = get_global_id(2);
330
502
 
331
- uint ne00_blk = ne00 / QK_MXFP4;
332
- uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
503
+ if (i01 >= ne01) {
504
+ return;
505
+ }
506
+
507
+ uint ne00_blk = ne00 / QK4_1;
333
508
  uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
509
+ uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
334
510
 
335
- global struct block_mxfp4 * b = dst + dst_blk_offset;
511
+ __global struct block_q4_1 * b = dst0 + dst_blk_offset;
512
+ b->d = src_d[src_dm_offset];
513
+ b->m = src_m[src_dm_offset];
336
514
 
337
- ((global uint4 *)(&(b->qs[0])))[0] = src_q[src_blk_offset];
338
- b->e = src_e[src_blk_offset];
515
+ // collect transposed quantization parts for a block
516
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
517
+ uint4 q_block;
518
+ q_block.x = src_q[src_q_offset];
519
+ q_block.y = src_q[src_q_offset + ne01];
520
+ q_block.z = src_q[src_q_offset + ne01 * 2];
521
+ q_block.w = src_q[src_q_offset + ne01 * 3];
522
+
523
+ ushort8 post_block = as_ushort8(q_block);
524
+ ushort8 pre_block = (ushort8)(0);
525
+
526
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
527
+ uchar * post_block_ptr = (uchar *)(&post_block);
528
+
529
+ for (int i = 0; i < QK4_0 / 4; ++i) {
530
+ uchar x0 = post_block_ptr[i + 0];
531
+ uchar x1 = post_block_ptr[i + QK4_0 / 4];
532
+
533
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
534
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
535
+ }
536
+
537
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
339
538
  }
340
539
 
341
540
  //------------------------------------------------------------------------------
342
- // block_q8_0
541
+ // kernel_convert_block_q5_0
542
+ // Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA).
543
+ // This kernel does not deshuffle the bits.
343
544
  //------------------------------------------------------------------------------
344
- typedef struct {
345
- half d; // delta
346
- char qs[QK8_0]; // quants
347
- } block_q8_0;
545
+ kernel void kernel_convert_block_q5_0(
546
+ global struct block_q5_0 * src0,
547
+ global uchar * dst_qs,
548
+ global uint * dst_qh,
549
+ global half * dst_d,
550
+ ulong n_blk
551
+ ) {
552
+ if (get_global_id(0) >= n_blk) {
553
+ return;
554
+ }
348
555
 
349
- kernel void kernel_convert_block_q8_0(
350
- global block_q8_0 * src0,
556
+ global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
557
+ global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0);
558
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
559
+ global half * d = (global half *) dst_d + get_global_id(0);
560
+
561
+ *d = b->d;
562
+ *qh = *((global uint *)(b->qh));
563
+
564
+ for (int i = 0; i < QK5_0/2; ++i) {
565
+ qs[i] = b->qs[i];
566
+ }
567
+ }
568
+
569
+ kernel void kernel_restore_block_q5_0(
570
+ global uchar * src_qs,
571
+ global uint * src_qh,
572
+ global half * src_d,
573
+ global struct block_q5_0 * dst
574
+ ) {
575
+ global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
576
+ global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0);
577
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
578
+ global half * d = (global half *) src_d + get_global_id(0);
579
+
580
+ b->d = *d;
581
+ *((global uint *)(b->qh)) = *qh;
582
+ for (int i = 0; i < QK5_0/2; ++i) {
583
+ b->qs[i] = qs[i];
584
+ }
585
+ }
586
+
587
+ kernel void kernel_convert_block_q5_0_noshuffle(
588
+ global struct block_q5_0 * src0,
351
589
  global uchar * dst_q,
590
+ global uint * dst_qh,
352
591
  global half * dst_d
353
592
  ) {
354
- global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
355
- global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
356
- global half * d = (global half *) dst_d + get_global_id(0);
593
+ global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0);
594
+ global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0);
595
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
596
+ global half * d = (global half *) dst_d + get_global_id(0);
357
597
 
358
598
  *d = b->d;
599
+ *qh = *((global uint *)(b->qh));
359
600
 
360
- for (int i = 0; i < QK8_0; ++i) {
361
- q[i] = b->qs[i];
601
+ for (int i = 0; i < QK5_0/4; ++i) {
602
+ uchar x0 = b->qs[2*i + 0];
603
+ uchar x1 = b->qs[2*i + 1];
604
+
605
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
606
+ q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
607
+
608
+ #ifdef ADRENO_GPU
609
+ if (get_global_id(0) == 65536*4096) {
610
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
611
+ }
612
+ #endif
362
613
  }
363
614
  }
364
615
 
365
- kernel void kernel_restore_block_q8_0(
616
+ kernel void kernel_restore_block_q5_0_noshuffle(
366
617
  global uchar * src_q,
618
+ global uint * src_qh,
367
619
  global half * src_d,
368
- global block_q8_0 * dst
620
+ global struct block_q5_0 * dst,
621
+ uchar mask_0F,
622
+ uchar mask_F0
369
623
  ) {
370
- global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
371
- global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
372
- global half * d = (global half *) src_d + get_global_id(0);
624
+ global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0);
625
+ global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0);
626
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
627
+ global half * d = (global half *) src_d + get_global_id(0);
373
628
 
374
629
  b->d = *d;
375
- for (int i = 0; i < QK8_0; ++i) {
376
- b->qs[i] = q[i];
630
+ *((global uint *)(b->qh)) = *qh;
631
+
632
+ for (int i = 0; i < QK5_0/4; ++i) {
633
+ uchar x0 = q[i + 0 ];
634
+ uchar x1 = q[i + QK5_0/4];
635
+
636
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
637
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
377
638
  }
378
639
  }
379
640
 
380
- kernel void kernel_restore_block_q8_0_trans(
381
- global uchar * src_q,
382
- global half * src_d,
383
- global block_q8_0 * dst,
641
+ kernel void kernel_convert_block_q5_0_trans4_ns(
642
+ __global struct block_q5_0 * src0,
643
+ __global uint * dst_qs,
644
+ __global uint * dst_qh,
645
+ __global half * dst_d,
384
646
  uint ne00,
385
647
  uint ne01
386
- ){
387
- uint num_blk_per_row = ne00 / QK8_0;
648
+ ) {
649
+ uint i00 = get_global_id(1);
650
+ uint i01 = get_global_id(0);
651
+ uint i02 = get_global_id(2);
388
652
 
389
- global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
390
- global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
391
- global half * d = (global half *) src_d + get_global_id(0);
653
+ if (i01 >= ne01) {
654
+ return;
655
+ }
392
656
 
393
- for (uint blk = 0; blk < num_blk_per_row; blk++) {
394
- b->d = *d;
657
+ uint ne00_blk = ne00 / QK5_0;
658
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
659
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
395
660
 
396
- for (uint i = 0; i < QK8_0; i+=4) {
397
- b->qs[i] = q[0];
398
- b->qs[i+1] = q[1];
399
- b->qs[i+2] = q[2];
400
- b->qs[i+3] = q[3];
661
+ global struct block_q5_0 * b = src0 + src_blk_offset;
662
+ dst_d[dst_blk_offset] = b->d;
401
663
 
402
- q += 4 * ne01; // M stride
403
- }
664
+ dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];
404
665
 
405
- d += ne01;
666
+ // extract quantization and unshuffle
667
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
668
+ ushort8 post_block = (ushort8)(0);
406
669
 
407
- b++;
670
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
671
+ uchar * post_block_ptr = (uchar *)(&post_block);
672
+
673
+ for (int i = 0; i < QK5_0 / 4; ++i) {
674
+ uchar x0 = pre_block_ptr[2*i + 0];
675
+ uchar x1 = pre_block_ptr[2*i + 1];
676
+
677
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
678
+ post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
408
679
  }
680
+
681
+ uint4 q_block = as_uint4(post_block);
682
+
683
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
684
+ dst_qs[offset] = q_block.x;
685
+ dst_qs[offset + ne01] = q_block.y;
686
+ dst_qs[offset + ne01 * 2] = q_block.z;
687
+ dst_qs[offset + ne01 * 3] = q_block.w;
409
688
  }
410
689
 
411
- //------------------------------------------------------------------------------
412
- // kernel_convert_block_q6_K
413
- // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
414
- // This kernel does not deshuffle the bits.
415
- // Each thread processes a super block.
416
- //------------------------------------------------------------------------------
417
- kernel void kernel_convert_block_q6_K(
418
- global struct block_q6_K * src0,
419
- global uchar * dst_ql,
420
- global uchar * dst_qh,
421
- global char * dst_s,
422
- global half * dst_d
690
+ kernel void kernel_restore_block_q5_0_trans4_ns(
691
+ __global uint * src_qs,
692
+ __global uint * src_qh,
693
+ __global half * src_d,
694
+ __global struct block_q5_0 * dst0,
695
+ uint ne00,
696
+ uint ne01
423
697
  ) {
698
+ int i00 = get_global_id(1);
699
+ uint i01 = get_global_id(0);
700
+ uint i02 = get_global_id(2);
701
+
702
+ if (i01 >= ne01) {
703
+ return;
704
+ }
705
+
706
+ uint ne00_blk = ne00 / QK5_0;
707
+ uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
708
+ uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
709
+
710
+ __global struct block_q5_0 * b = dst0 + dst_blk_offset;
711
+ b->d = src_d[src_blk_offset];
712
+
713
+ ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];
714
+
715
+ // collect transposed quantization parts for a block
716
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
717
+ uint4 q_block;
718
+ q_block.x = src_qs[src_q_offset];
719
+ q_block.y = src_qs[src_q_offset + ne01];
720
+ q_block.z = src_qs[src_q_offset + ne01 * 2];
721
+ q_block.w = src_qs[src_q_offset + ne01 * 3];
722
+
723
+ ushort8 post_block = as_ushort8(q_block);
724
+ ushort8 pre_block = (ushort8)(0);
725
+
726
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
727
+ uchar * post_block_ptr = (uchar *)(&post_block);
728
+
729
+ for (int i = 0; i < QK5_0 / 4; ++i) {
730
+ uchar x0 = post_block_ptr[i + 0];
731
+ uchar x1 = post_block_ptr[i + QK5_0 / 4];
732
+
733
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
734
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
735
+ }
736
+
737
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
738
+ }
739
+
740
+ //------------------------------------------------------------------------------
741
+ // kernel_convert_block_q5_1
742
+ // Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA).
743
+ // This kernel does not deshuffle the bits.
744
+ //------------------------------------------------------------------------------
745
+ kernel void kernel_convert_block_q5_1(
746
+ global struct block_q5_1 * src0,
747
+ global uchar * dst_qs,
748
+ global uint * dst_qh,
749
+ global half * dst_d,
750
+ global half * dst_m,
751
+ ulong n_blk
752
+ ) {
753
+ if (get_global_id(0) >= n_blk) {
754
+ return;
755
+ }
756
+
757
+ global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
758
+ global uchar * qs = (global uchar *) dst_qs + (QK5_1/2)*get_global_id(0);
759
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
760
+ global half * d = (global half *) dst_d + get_global_id(0);
761
+ global half * m = (global half *) dst_m + get_global_id(0);
762
+
763
+ *d = b->d;
764
+ *m = b->m;
765
+ *qh = *((global uint *)(b->qh));
766
+
767
+ for (int i = 0; i < QK5_1/2; ++i) {
768
+ qs[i] = b->qs[i];
769
+ }
770
+ }
771
+
772
+ kernel void kernel_restore_block_q5_1(
773
+ global uchar * src_qs,
774
+ global uint * src_qh,
775
+ global half * src_d,
776
+ global half * src_m,
777
+ global struct block_q5_1 * dst
778
+ ) {
779
+ global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
780
+ global uchar * qs = (global uchar *) src_qs + (QK5_1/2)*get_global_id(0);
781
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
782
+ global half * d = (global half *) src_d + get_global_id(0);
783
+ global half * m = (global half *) src_m + get_global_id(0);
784
+
785
+ b->d = *d;
786
+ b->m = *m;
787
+ *((global uint *)(b->qh)) = *qh;
788
+ for (int i = 0; i < QK5_1/2; ++i) {
789
+ b->qs[i] = qs[i];
790
+ }
791
+ }
792
+
793
+ kernel void kernel_convert_block_q5_1_noshuffle(
794
+ global struct block_q5_1 * src0,
795
+ global uchar * dst_q,
796
+ global uint * dst_qh,
797
+ global half * dst_d,
798
+ global half * dst_m
799
+ ) {
800
+ global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0);
801
+ global uchar * q = (global uchar *) dst_q + QK5_1/2*get_global_id(0);
802
+ global uint * qh = (global uint *) dst_qh + get_global_id(0);
803
+ global half * d = (global half *) dst_d + get_global_id(0);
804
+ global half * m = (global half *) dst_m + get_global_id(0);
805
+
806
+ *d = b->d;
807
+ *m = b->m;
808
+ *qh = *((global uint *)(b->qh));
809
+
810
+ for (int i = 0; i < QK5_1/4; ++i) {
811
+ uchar x0 = b->qs[2*i + 0];
812
+ uchar x1 = b->qs[2*i + 1];
813
+
814
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
815
+ q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
816
+
817
+ #ifdef ADRENO_GPU
818
+ if (get_global_id(0) == 65536*4096) {
819
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
820
+ }
821
+ #endif
822
+ }
823
+ }
824
+
825
+ kernel void kernel_restore_block_q5_1_noshuffle(
826
+ global uchar * src_q,
827
+ global uint * src_qh,
828
+ global half * src_d,
829
+ global half * src_m,
830
+ global struct block_q5_1 * dst,
831
+ uchar mask_0F,
832
+ uchar mask_F0
833
+ ) {
834
+ global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0);
835
+ global uchar * q = (global uchar *) src_q + QK5_1/2*get_global_id(0);
836
+ global uint * qh = (global uint *) src_qh + get_global_id(0);
837
+ global half * d = (global half *) src_d + get_global_id(0);
838
+ global half * m = (global half *) src_m + get_global_id(0);
839
+
840
+ b->d = *d;
841
+ b->m = *m;
842
+ *((global uint *)(b->qh)) = *qh;
843
+
844
+ for (int i = 0; i < QK5_1/4; ++i) {
845
+ uchar x0 = q[i + 0 ];
846
+ uchar x1 = q[i + QK5_1/4];
847
+
848
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
849
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
850
+ }
851
+ }
852
+
853
+ kernel void kernel_convert_block_q5_1_trans4_ns(
854
+ __global struct block_q5_1 * src0,
855
+ __global uint * dst_qs,
856
+ __global uint * dst_qh,
857
+ __global half * dst_d,
858
+ __global half * dst_m,
859
+ uint ne00,
860
+ uint ne01
861
+ ) {
862
+ uint i00 = get_global_id(1);
863
+ uint i01 = get_global_id(0);
864
+ uint i02 = get_global_id(2);
865
+
866
+ if (i01 >= ne01) {
867
+ return;
868
+ }
869
+
870
+ uint ne00_blk = ne00 / QK5_1;
871
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
872
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
873
+
874
+ global struct block_q5_1 * b = src0 + src_blk_offset;
875
+ dst_d[dst_blk_offset] = b->d;
876
+ dst_m[dst_blk_offset] = b->m;
877
+
878
+ dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];
879
+
880
+ // extract quantization and unshuffle
881
+ ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
882
+ ushort8 post_block = (ushort8)(0);
883
+
884
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
885
+ uchar * post_block_ptr = (uchar *)(&post_block);
886
+
887
+ for (int i = 0; i < QK5_1 / 4; ++i) {
888
+ uchar x0 = pre_block_ptr[2*i + 0];
889
+ uchar x1 = pre_block_ptr[2*i + 1];
890
+
891
+ post_block_ptr[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
892
+ post_block_ptr[i + QK5_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
893
+ }
894
+
895
+ uint4 q_block = as_uint4(post_block);
896
+
897
+ uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
898
+ dst_qs[offset] = q_block.x;
899
+ dst_qs[offset + ne01] = q_block.y;
900
+ dst_qs[offset + ne01 * 2] = q_block.z;
901
+ dst_qs[offset + ne01 * 3] = q_block.w;
902
+ }
903
+
904
+ kernel void kernel_restore_block_q5_1_trans4_ns(
905
+ __global uint * src_qs,
906
+ __global uint * src_qh,
907
+ __global half * src_d,
908
+ __global half * src_m,
909
+ __global struct block_q5_1 * dst0,
910
+ uint ne00,
911
+ uint ne01
912
+ ) {
913
+ int i00 = get_global_id(1);
914
+ uint i01 = get_global_id(0);
915
+ uint i02 = get_global_id(2);
916
+
917
+ if (i01 >= ne01) {
918
+ return;
919
+ }
920
+
921
+ uint ne00_blk = ne00 / QK5_1;
922
+ uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
923
+ uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
924
+
925
+ __global struct block_q5_1 * b = dst0 + dst_blk_offset;
926
+ b->d = src_d[src_blk_offset];
927
+ b->m = src_m[src_blk_offset];
928
+
929
+ ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];
930
+
931
+ // collect transposed quantization parts for a block
932
+ uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
933
+ uint4 q_block;
934
+ q_block.x = src_qs[src_q_offset];
935
+ q_block.y = src_qs[src_q_offset + ne01];
936
+ q_block.z = src_qs[src_q_offset + ne01 * 2];
937
+ q_block.w = src_qs[src_q_offset + ne01 * 3];
938
+
939
+ ushort8 post_block = as_ushort8(q_block);
940
+ ushort8 pre_block = (ushort8)(0);
941
+
942
+ uchar * pre_block_ptr = (uchar *)(&pre_block);
943
+ uchar * post_block_ptr = (uchar *)(&post_block);
944
+
945
+ for (int i = 0; i < QK5_1 / 4; ++i) {
946
+ uchar x0 = post_block_ptr[i + 0];
947
+ uchar x1 = post_block_ptr[i + QK5_1 / 4];
948
+
949
+ pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
950
+ pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
951
+ }
952
+ ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
953
+ }
954
+
955
+ kernel void kernel_convert_block_q4_k_trans4_ns(
956
+ __global struct block_q4_K * src0,
957
+ __global uint * dst_q,
958
+ __global half * dst_d,
959
+ __global half * dst_dm,
960
+ __global uchar * dst_s,
961
+ uint ne00,
962
+ uint ne01,
963
+ uchar mask_0F,
964
+ uchar mask_F0
965
+ ) {
966
+ uint i00 = get_global_id(1);
967
+ uint i01 = get_global_id(0);
968
+ uint i02 = get_global_id(2);
969
+
970
+ if (i01 >= ne01) {
971
+ return;
972
+ }
973
+
974
+ uint ne00_blk = ne00 / QK_K;
975
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
976
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
977
+
978
+ __global struct block_q4_K * b = src0 + src_blk_offset;
979
+
980
+ dst_d [dst_blk_offset] = b->d;
981
+ dst_dm[dst_blk_offset] = b->dm;
982
+
983
+ uint4 qv[8];
984
+ uchar * qv_bytes = (uchar *)qv;
985
+ for (int i = 0; i < QK_K / 64; ++i) {
986
+ for (int j = 0; j < 16; ++j) {
987
+ uchar x0 = b->q[i*32 + 2*j];
988
+ uchar x1 = b->q[i*32 + 2*j + 1];
989
+
990
+ qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
991
+ qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
992
+ }
993
+ }
994
+
995
+ uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
996
+ #pragma unroll
997
+ for (int p = 0; p < 8; ++p) {
998
+ uint4 v = qv[p];
999
+ dst_q[base + (p * 4 + 0) * ne01] = v.x;
1000
+ dst_q[base + (p * 4 + 1) * ne01] = v.y;
1001
+ dst_q[base + (p * 4 + 2) * ne01] = v.z;
1002
+ dst_q[base + (p * 4 + 3) * ne01] = v.w;
1003
+ }
1004
+
1005
+ __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
1006
+ #pragma unroll
1007
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1008
+ s_dst[i] = b->s[i];
1009
+ }
1010
+ }
1011
+
1012
+ kernel void kernel_restore_block_q4_k_trans4_ns(
1013
+ __global uint * src_q,
1014
+ __global half * src_d,
1015
+ __global half * src_dm,
1016
+ __global uchar * src_s,
1017
+ __global struct block_q4_K * dst0,
1018
+ uint ne00,
1019
+ uint ne01,
1020
+ uchar mask_0F,
1021
+ uchar mask_F0
1022
+ ) {
1023
+ uint i00 = get_global_id(1); // block index along K
1024
+ uint i01 = get_global_id(0); // row index
1025
+ uint i02 = get_global_id(2); // batch index
1026
+
1027
+ if (i01 >= ne01) {
1028
+ return;
1029
+ }
1030
+
1031
+ uint ne00_blk = ne00 / QK_K;
1032
+
1033
+ uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
1034
+ uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
1035
+
1036
+ __global struct block_q4_K * b = dst0 + dst_blk_offset;
1037
+
1038
+ b->d = src_d[src_blk_offset];
1039
+ b->dm = src_dm[src_blk_offset];
1040
+
1041
+ __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
1042
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1043
+ b->s[i] = s_src[i];
1044
+ }
1045
+
1046
+ uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
1047
+
1048
+ uint4 qv[8];
1049
+ for (int p = 0; p < 8; ++p) {
1050
+ qv[p].x = src_q[base + (p * 4 + 0) * ne01];
1051
+ qv[p].y = src_q[base + (p * 4 + 1) * ne01];
1052
+ qv[p].z = src_q[base + (p * 4 + 2) * ne01];
1053
+ qv[p].w = src_q[base + (p * 4 + 3) * ne01];
1054
+ }
1055
+
1056
+ uchar * qv_bytes = (uchar *)qv;
1057
+ for (int i = 0; i < QK_K / 64; ++i) {
1058
+ for (int j = 0; j < 16; ++j) {
1059
+ uchar lo = qv_bytes[i*32 + j];
1060
+ uchar hi = qv_bytes[i*32 + j + 16];
1061
+ b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
1062
+ b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
1063
+ }
1064
+ }
1065
+ }
1066
+
1067
+ kernel void kernel_convert_block_q5_k_trans4_ns(
1068
+ __global struct block_q5_K * src0,
1069
+ __global uint * dst_qs,
1070
+ __global uint * dst_qh,
1071
+ __global half * dst_d,
1072
+ __global half * dst_dm,
1073
+ __global uchar * dst_s,
1074
+ uint ne00,
1075
+ uint ne01,
1076
+ uchar mask_0F,
1077
+ uchar mask_F0
1078
+ ) {
1079
+ uint i00 = get_global_id(1);
1080
+ uint i01 = get_global_id(0);
1081
+ uint i02 = get_global_id(2);
1082
+
1083
+ if (i01 >= ne01) {
1084
+ return;
1085
+ }
1086
+
1087
+ uint ne00_blk = ne00 / QK_K;
1088
+ uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
1089
+ uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
1090
+
1091
+ __global struct block_q5_K * b = src0 + src_blk_offset;
1092
+
1093
+ dst_d [dst_blk_offset] = b->d;
1094
+ dst_dm[dst_blk_offset] = b->dm;
1095
+
1096
+ for (int k = 0; k < 8; k++) {
1097
+ uchar b0 = 0, b1 = 0, b2 = 0, b3 = 0;
1098
+ for (int bit = 0; bit < 8; bit++) {
1099
+ b0 |= (uchar)(((b->qh[bit] >> k) & 1) << bit);
1100
+ b1 |= (uchar)(((b->qh[8 + bit] >> k) & 1) << bit);
1101
+ b2 |= (uchar)(((b->qh[16 + bit] >> k) & 1) << bit);
1102
+ b3 |= (uchar)(((b->qh[24 + bit] >> k) & 1) << bit);
1103
+ }
1104
+ uint packed = (uint)b0 | ((uint)b1 << 8) | ((uint)b2 << 16) | ((uint)b3 << 24);
1105
+ dst_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01] = packed;
1106
+ }
1107
+
1108
+ uint4 qv[8];
1109
+ uchar * qv_bytes = (uchar *)qv;
1110
+ for (int i = 0; i < QK_K / 64; ++i) {
1111
+ for (int j = 0; j < 16; ++j) {
1112
+ uchar x0 = b->qs[i*32 + 2*j];
1113
+ uchar x1 = b->qs[i*32 + 2*j + 1];
1114
+
1115
+ qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
1116
+ qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
1117
+ }
1118
+ }
1119
+
1120
+ uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01;
1121
+ #pragma unroll
1122
+ for (int p = 0; p < 8; ++p) {
1123
+ uint4 v = qv[p];
1124
+ dst_qs[base + (p * 4 + 0) * ne01] = v.x;
1125
+ dst_qs[base + (p * 4 + 1) * ne01] = v.y;
1126
+ dst_qs[base + (p * 4 + 2) * ne01] = v.z;
1127
+ dst_qs[base + (p * 4 + 3) * ne01] = v.w;
1128
+ }
1129
+
1130
+ __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE;
1131
+ #pragma unroll
1132
+ for (int i = 0; i < K_SCALE_SIZE; ++i) {
1133
+ s_dst[i] = b->s[i];
1134
+ }
1135
+ }
1136
+
// Restore block_q5_K super blocks from the transposed "trans4_ns" arrays.
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch. mask_0F / mask_F0 select the low / high nibble of a
// byte (presumably 0x0F and 0xF0, supplied by the host).
kernel void kernel_restore_block_q5_k_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_dm,
    __global uchar * src_s,
    __global struct block_q5_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1); // block index along K
    const uint row   = get_global_id(0); // row index
    const uint batch = get_global_id(2); // batch index

    // Work-items past the last row have nothing to restore.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_blks   = batch * blks_per_row * ne01;
    const uint soa_idx      = row + blk * ne01 + batch_blks;

    __global struct block_q5_K * out = dst0 + (blk + row * blks_per_row + batch_blks);

    out->d  = src_d[soa_idx];
    out->dm = src_dm[soa_idx];

    // High bits: each block owns 8 uint bit-planes in src_qh; bit j of plane k
    // is bit k of qh[j]. Fetch the planes once, then gather each output byte.
    uint planes[8];
    const uint qh_base = row + batch_blks * 8;
    for (int k = 0; k < 8; ++k) {
        planes[k] = src_qh[qh_base + (blk * 8 + k) * ne01];
    }
    for (int j = 0; j < 32; ++j) {
        uchar acc = 0;
        for (int k = 0; k < 8; ++k) {
            acc |= (uchar)(((planes[k] >> j) & 1u) << k);
        }
        out->qh[j] = acc;
    }

    // Scales are stored per block in row-major order and copied verbatim.
    __global uchar * scales = src_s + (batch * ne01 + row) * blks_per_row * K_SCALE_SIZE + blk * K_SCALE_SIZE;
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        out->s[i] = scales[i];
    }

    // Low nibbles: 32 uints per block, consecutive words ne01 elements apart.
    uint words[32];
    const uint qs_base = batch_blks * 32 + blk * ne01 * 32 + row;
    for (int w = 0; w < 32; ++w) {
        words[w] = src_qs[qs_base + w * ne01];
    }

    // Re-interleave the nibbles that the convert kernel separated.
    uchar * packed = (uchar *)words;
    for (int i = 0; i < QK_K / 64; ++i) {
        for (int j = 0; j < 16; ++j) {
            const uchar lo = packed[i*32 + j];
            const uchar hi = packed[i*32 + j + 16];
            out->qs[i*32 + 2*j]     = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
            out->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
        }
    }
}
1207
+
// Convert block_q6_K super blocks (AOS) into the transposed "trans4_ns" arrays:
// dst_ql (32 uints per block), dst_qh (16 uints per block), dst_d (one half per
// block) and dst_s (16 scale bytes per block).
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch. mask_0F / mask_F0 select the low / high nibble of a
// byte (presumably 0x0F and 0xF0, supplied by the host).
kernel void kernel_convert_block_q6_k_trans4_ns(
    __global struct block_q6_K * src0,
    __global uint * dst_ql,
    __global uint * dst_qh,
    __global half * dst_d,
    __global char * dst_s,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1);
    const uint row   = get_global_id(0);
    const uint batch = get_global_id(2);

    // Work-items past the last row have nothing to convert.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_blks   = batch * blks_per_row * ne01;

    __global struct block_q6_K * in = src0 + (blk + row * blks_per_row + batch_blks);

    dst_d[row + blk * ne01 + batch_blks] = in->d;

    // Low 4 bits: separate the nibbles of each byte pair into four 16-byte runs
    // per 64-byte half of ql.
    uint ql_words[32];
    uchar * ql_bytes = (uchar *)ql_words;
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 16; ++j) {
            const uchar x0 = in->ql[i*64 + 2*j];
            const uchar x1 = in->ql[i*64 + 2*j + 1];
            const uchar x2 = in->ql[i*64 + 32 + 2*j];
            const uchar x3 = in->ql[i*64 + 32 + 2*j + 1];
            ql_bytes[i*64 + j     ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
            ql_bytes[i*64 + j + 16] = convert_uchar(x2 & mask_0F) | convert_uchar((x3 & mask_0F) << 4);
            ql_bytes[i*64 + j + 32] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
            ql_bytes[i*64 + j + 48] = convert_uchar((x2 & mask_F0) >> 4) | convert_uchar(x3 & mask_F0);
        }
    }

    // Consecutive words of one block are stored ne01 elements apart.
    const uint ql_base = batch_blks * 32 + blk * ne01 * 32 + row;
    #pragma unroll
    for (int w = 0; w < 32; ++w) {
        dst_ql[ql_base + w * ne01] = ql_words[w];
    }

    // High 2 bits: every qh byte carries four 2-bit fields; field c of byte l
    // in half n goes to word (n*4 + c)*2 + l/16 at bit position (l%16)*2.
    uint planes[16];
    for (int p = 0; p < 16; ++p) {
        planes[p] = 0;
    }
    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            const uchar h      = in->qh[n*32 + l];
            const int half_idx = l >> 4;
            const int shift    = (l & 15) << 1;
            for (int c = 0; c < 4; ++c) {
                planes[(n*4 + c)*2 + half_idx] |= ((uint)((h >> (2*c)) & 0x03)) << shift;
            }
        }
    }

    const uint qh_base = batch_blks * 16 + blk * ne01 * 16 + row;
    for (int p = 0; p < 16; ++p) {
        dst_qh[qh_base + p * ne01] = planes[p];
    }

    // Scales are copied verbatim, 16 bytes per block in row-major order.
    __global char * scales = dst_s + (batch * ne01 + row) * blks_per_row * 16 + blk * 16;
    #pragma unroll
    for (int i = 0; i < 16; ++i) {
        scales[i] = in->scales[i];
    }
}
1288
+
// Restore block_q6_K super blocks from the transposed "trans4_ns" arrays.
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch. mask_0F / mask_F0 select the low / high nibble of a
// byte (presumably 0x0F and 0xF0, supplied by the host).
kernel void kernel_restore_block_q6_k_trans4_ns(
    __global uint * src_ql,
    __global uint * src_qh,
    __global half * src_d,
    __global char * src_s,
    __global struct block_q6_K * dst0,
    uint ne00,
    uint ne01,
    uchar mask_0F,
    uchar mask_F0
) {
    const uint blk   = get_global_id(1); // block index along K
    const uint row   = get_global_id(0); // row index
    const uint batch = get_global_id(2); // batch index

    // Work-items past the last row have nothing to restore.
    if (row >= ne01) {
        return;
    }

    const uint blks_per_row = ne00 / QK_K;
    const uint batch_blks   = batch * blks_per_row * ne01;

    __global struct block_q6_K * out = dst0 + (blk + row * blks_per_row + batch_blks);

    out->d = src_d[row + blk * ne01 + batch_blks];

    // Low 4 bits: 32 uints per block, consecutive words ne01 elements apart.
    uint ql_words[32];
    const uint ql_base = batch_blks * 32 + blk * ne01 * 32 + row;
    for (int w = 0; w < 32; ++w) {
        ql_words[w] = src_ql[ql_base + w * ne01];
    }

    // Re-interleave the nibbles that the convert kernel separated.
    uchar * ql_bytes = (uchar *)ql_words;
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 16; ++j) {
            const uchar lo_02 = ql_bytes[i*64 + j];
            const uchar lo_13 = ql_bytes[i*64 + j + 16];
            const uchar hi_02 = ql_bytes[i*64 + j + 32];
            const uchar hi_13 = ql_bytes[i*64 + j + 48];
            out->ql[i*64 + 2*j]          = convert_uchar((lo_02 & mask_0F) | ((hi_02 & mask_0F) << 4));
            out->ql[i*64 + 2*j + 1]      = convert_uchar(((lo_02 & mask_F0) >> 4) | (hi_02 & mask_F0));
            out->ql[i*64 + 32 + 2*j]     = convert_uchar((lo_13 & mask_0F) | ((hi_13 & mask_0F) << 4));
            out->ql[i*64 + 32 + 2*j + 1] = convert_uchar(((lo_13 & mask_F0) >> 4) | (hi_13 & mask_F0));
        }
    }

    // High 2 bits: 16 uints per block, consecutive words ne01 elements apart.
    uint planes[16];
    const uint qh_base = batch_blks * 16 + blk * ne01 * 16 + row;
    for (int p = 0; p < 16; ++p) {
        planes[p] = src_qh[qh_base + p * ne01];
    }

    // Field c of qh byte l in half n lives in word (n*4 + c)*2 + l/16 at bit
    // position (l%16)*2; collect the four fields back into one byte.
    for (int n = 0; n < 2; ++n) {
        for (int l = 0; l < 32; ++l) {
            const int half_idx = l >> 4;
            const int shift    = (l & 15) << 1;
            uchar packed = 0;
            for (int c = 0; c < 4; ++c) {
                packed |= (uchar)(((planes[(n*4 + c)*2 + half_idx] >> shift) & 0x03) << (2*c));
            }
            out->qh[n*32 + l] = packed;
        }
    }

    // Scales are copied verbatim, 16 bytes per block in row-major order.
    __global char * scales = src_s + (batch * ne01 + row) * blks_per_row * 16 + blk * 16;
    for (int i = 0; i < 16; ++i) {
        out->scales[i] = scales[i];
    }
}
1363
+
//------------------------------------------------------------------------------
// block_mxfp4
// One block covers QK_MXFP4 values: a single exponent byte followed by
// QK_MXFP4 / 2 bytes of packed quants. All members are uchar, so the struct is
// 1 + QK_MXFP4 / 2 bytes and qs starts at byte offset 1.
//------------------------------------------------------------------------------
#define QK_MXFP4 32
struct block_mxfp4 {
    uchar e;                // E8M0
    uchar qs[QK_MXFP4 / 2]; // packed quants
};
1372
+
//------------------------------------------------------------------------------
// kernel_convert_block_mxfp4
// Split block_mxfp4 (AOS) into a quant array and an exponent array (SOA).
// The quant bytes are copied as-is; no bits are rearranged.
// One work-item per block, indexed by get_global_id(0).
//------------------------------------------------------------------------------
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    const size_t blk = get_global_id(0);

    global struct block_mxfp4 * in = src0 + blk;
    global uchar * quants = dst_q + QK_MXFP4 / 2 * blk;

    dst_e[blk] = in->e;

    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        quants[i] = in->qs[i];
    }
}
1393
+
// Convert block_mxfp4 (AOS) into transposed SOA arrays: one uint4 of packed
// quants and one exponent byte per block. Source blocks are indexed
// [batch][row][block along K]; destination entries are indexed
// [batch][block along K][row].
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch.
kernel void kernel_convert_block_mxfp4_trans(
    global struct block_mxfp4 * src0,
    __global uint4 * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1); // block index along K
    uint i01 = get_global_id(0); // row index
    uint i02 = get_global_id(2); // batch index

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;

    // block_mxfp4 is 17 bytes with qs at offset 1, so qs is generally not
    // 16-byte aligned and must not be read through a uint4 pointer cast.
    // vload16 only requires uchar alignment; as_uint4 keeps the same bytes.
    dst_q[dst_blk_offset] = as_uint4(vload16(0, b->qs));
    dst_e[dst_blk_offset] = b->e;
}
1414
+
// Rebuild block_mxfp4 (AOS) from the flat quant and exponent arrays produced by
// kernel_convert_block_mxfp4. One work-item per block, indexed by
// get_global_id(0).
// src_e holds one exponent byte per block, so it is declared as uchar to match
// the convert kernel (it was previously declared half and cast back to uchar).
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    global struct block_mxfp4 * b = dst + get_global_id(0);
    global uchar * q = src_q + QK_MXFP4 / 2 * get_global_id(0);
    global uchar * e = src_e + get_global_id(0);

    b->e = *e;
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
1429
+
// Rebuild block_mxfp4 (AOS) from the transposed SOA arrays written by
// kernel_convert_block_mxfp4_trans. Source entries are indexed
// [batch][block along K][row]; destination blocks are indexed
// [batch][row][block along K].
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch.
kernel void kernel_restore_block_mxfp4_trans(
    __global uint4 * src_q,
    __global uchar * src_e,
    global struct block_mxfp4 * dst,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1); // block index along K
    uint i01 = get_global_id(0); // row index
    uint i02 = get_global_id(2); // batch index

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = dst + dst_blk_offset;

    // block_mxfp4 is 17 bytes with qs at offset 1, so qs is generally not
    // 16-byte aligned and must not be written through a uint4 pointer cast.
    // vstore16 only requires uchar alignment; as_uchar16 keeps the same bytes.
    vstore16(as_uchar16(src_q[src_blk_offset]), 0, b->qs);
    b->e = src_e[src_blk_offset];
}
1450
+
// Convert block_mxfp4 (AOS) into the transposed, unshuffled "trans4_ns" layout:
// four uints of quants per block (consecutive words ne01 elements apart) and
// one exponent byte per block.
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch.
kernel void kernel_convert_block_mxfp4_trans4_ns(
    global struct block_mxfp4 * src0,
    __global uint * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1); // block index along K
    uint i01 = get_global_id(0); // row index
    uint i02 = get_global_id(2); // batch index

    // Work-items past the last row have nothing to convert.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;
    dst_e[dst_blk_offset] = b->e;

    // Fetch the 16 quant bytes. block_mxfp4 is 17 bytes with qs at offset 1,
    // so qs is generally not 16-byte aligned and must not be read through a
    // ushort8 pointer cast; vload16 only requires uchar alignment.
    uchar pre_bytes[QK_MXFP4 / 2];
    uchar post_bytes[QK_MXFP4 / 2];
    vstore16(vload16(0, b->qs), 0, pre_bytes);

    // Unshuffle: for each byte pair, the two low nibbles go to the first half
    // of the output and the two high nibbles go to the second half.
    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = pre_bytes[2*i + 0];
        uchar x1 = pre_bytes[2*i + 1];

        post_bytes[i + 0           ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_bytes[i + QK_MXFP4 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(vload16(0, post_bytes));

    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_q[offset           ] = q_block.x;
    dst_q[offset + ne01    ] = q_block.y;
    dst_q[offset + ne01 * 2] = q_block.z;
    dst_q[offset + ne01 * 3] = q_block.w;
}
1497
+
// Rebuild block_mxfp4 (AOS) from the transposed, unshuffled "trans4_ns" layout
// written by kernel_convert_block_mxfp4_trans4_ns.
// Work-item mapping: get_global_id(0) = row, get_global_id(1) = block along K,
// get_global_id(2) = batch.
kernel void kernel_restore_block_mxfp4_trans4_ns(
    __global uint * src_q,
    __global uchar * src_e,
    __global struct block_mxfp4 * dst0,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1); // block index along K
    uint i01 = get_global_id(0); // row index
    uint i02 = get_global_id(2); // batch index

    // Work-items past the last row have nothing to restore.
    if (i01 >= ne01) {
        return;
    }

    uint ne00_blk = ne00 / QK_MXFP4;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_d_offset   = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_mxfp4 * b = dst0 + dst_blk_offset;
    b->e = src_e[src_d_offset];

    // Collect the four transposed quant words of this block.
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_q[src_q_offset];
    q_block.y = src_q[src_q_offset + ne01];
    q_block.z = src_q[src_q_offset + ne01 * 2];
    q_block.w = src_q[src_q_offset + ne01 * 3];

    uchar post_bytes[QK_MXFP4 / 2];
    uchar pre_bytes[QK_MXFP4 / 2];
    vstore16(as_uchar16(q_block), 0, post_bytes);

    // Re-interleave: byte i of the first half holds two low nibbles and byte i
    // of the second half holds the matching two high nibbles.
    for (int i = 0; i < QK_MXFP4 / 4; ++i) {
        uchar x0 = post_bytes[i + 0];
        uchar x1 = post_bytes[i + QK_MXFP4 / 4];

        pre_bytes[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_bytes[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    // block_mxfp4 is 17 bytes with qs at offset 1, so qs is generally not
    // 16-byte aligned and must not be written through a ushort8 pointer cast;
    // vstore16 only requires uchar alignment.
    vstore16(vload16(0, pre_bytes), 0, b->qs);
}
1544
+
1545
+
//------------------------------------------------------------------------------
// block_q8_0
// One block covers QK8_0 values: a half-precision delta followed by QK8_0
// signed 8-bit quants.
//------------------------------------------------------------------------------
typedef struct {
    half d;          // delta
    char qs[QK8_0];  // quants
} block_q8_0;
1553
+
// Split block_q8_0 (AOS) into a quant array and a delta array (SOA).
// One work-item per block, indexed by get_global_id(0).
kernel void kernel_convert_block_q8_0(
    global block_q8_0 * src0,
    global uchar * dst_q,
    global half * dst_d
) {
    const size_t blk = get_global_id(0);

    global block_q8_0 * in = src0 + blk;
    global uchar * quants = dst_q + QK8_0*blk;

    dst_d[blk] = in->d;

    for (int i = 0; i < QK8_0; ++i) {
        quants[i] = in->qs[i];
    }
}
1569
+
// Rebuild block_q8_0 (AOS) from the flat quant and delta arrays.
// One work-item per block, indexed by get_global_id(0).
kernel void kernel_restore_block_q8_0(
    global uchar * src_q,
    global half * src_d,
    global block_q8_0 * dst
) {
    const size_t blk = get_global_id(0);

    global block_q8_0 * out = dst + blk;
    global uchar * quants = src_q + QK8_0*blk;

    out->d = src_d[blk];

    for (int i = 0; i < QK8_0; ++i) {
        out->qs[i] = quants[i];
    }
}
1584
+
// Rebuild block_q8_0 (AOS) from transposed arrays. Each work-item
// (get_global_id(0)) restores ne00 / QK8_0 consecutive destination blocks.
// Quants are read four bytes at a time with a stride of 4 * ne01 bytes between
// groups; deltas are read with a stride of ne01 elements between blocks.
kernel void kernel_restore_block_q8_0_trans(
    global uchar * src_q,
    global half * src_d,
    global block_q8_0 * dst,
    uint ne00,
    uint ne01
){
    const uint blks_per_row = ne00 / QK8_0;

    global block_q8_0 * out   = dst + get_global_id(0) * blks_per_row;
    global uchar      * quads = src_q + get_global_id(0) * 4; // 4 8-bit packed
    global half       * delta = src_d + get_global_id(0);

    for (uint blk = 0; blk < blks_per_row; blk++) {
        out->d = *delta;

        for (uint i = 0; i < QK8_0; i += 4) {
            for (uint c = 0; c < 4; ++c) {
                out->qs[i + c] = quads[c];
            }

            quads += 4 * ne01; // M stride
        }

        delta += ne01;
        ++out;
    }
}
1615
+
//------------------------------------------------------------------------------
// kernel_convert_block_q4_K
// Split block_q4_K (AOS) into four flat arrays (SOA): quants, scales, d and dm.
// Quant bytes are copied as-is; no bits are rearranged.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 are unused here; they only keep the signature identical to
// the no-shuffle variant.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_K(
    global struct block_q4_K * src0,
    global uchar * dst_q,
    global uchar * dst_s,
    global half * dst_d,
    global half * dst_dm,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_K * in = src0 + blk;
    global uchar * quants = dst_q + QK_K/2*blk;
    global uchar * scales = dst_s + K_SCALE_SIZE*blk;

    dst_d[blk]  = in->d;
    dst_dm[blk] = in->dm;

    for (int i = 0; i < QK_K/2; ++i) {
        quants[i] = in->q[i];
    }
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        scales[i] = in->s[i];
    }
}
1649
+
// Rebuild block_q4_K (AOS) from the four flat arrays: quants, scales, d and dm.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 are unused here; they only keep the signature identical to
// the no-shuffle variant.
kernel void kernel_restore_block_q4_K(
    global uchar * src_q,
    global uchar * src_s,
    global half * src_d,
    global half * src_dm,
    global struct block_q4_K * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_K * out = dst + blk;
    global uchar * quants = src_q + QK_K/2*blk;
    global uchar * scales = src_s + K_SCALE_SIZE*blk;

    out->d  = src_d[blk];
    out->dm = src_dm[blk];

    for (int i = 0; i < QK_K/2; ++i) {
        out->q[i] = quants[i];
    }
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        out->s[i] = scales[i];
    }
}
1678
+
// Split block_q4_K (AOS) into four flat arrays (SOA) and rearrange the quant
// nibbles: within every 32-byte group, the low nibbles of each byte pair are
// packed into the first 16 bytes and the high nibbles into the last 16 bytes.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 select the low / high nibble of a byte (presumably 0x0F and
// 0xF0, supplied by the host).
kernel void kernel_convert_block_q4_K_noshuffle(
    global struct block_q4_K * src0,
    global uchar * dst_q,
    global uchar * dst_s,
    global half * dst_d,
    global half * dst_dm,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_K * in = src0 + blk;
    global uchar * quants = dst_q + QK_K/2 * blk;
    global uchar * scales = dst_s + K_SCALE_SIZE * blk;

    dst_d[blk]  = in->d;
    dst_dm[blk] = in->dm;

    for (int grp = 0; grp < QK_K / 64; ++grp) {
        for (int j = 0; j < 16; ++j) {
            const uchar even = in->q[grp*32 + 2*j];
            const uchar odd  = in->q[grp*32 + 2*j + 1];
            quants[grp*32 + j]      = convert_uchar(even & mask_0F) | convert_uchar((odd & mask_0F) << 4);
            quants[grp*32 + j + 16] = convert_uchar((even & mask_F0) >> 4) | convert_uchar(odd & mask_F0);
        }
    }

    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        scales[i] = in->s[i];
    }
}
1710
+
// Rebuild block_q4_K (AOS) from the flat arrays written by
// kernel_convert_block_q4_K_noshuffle, re-interleaving the quant nibbles of
// every 32-byte group back into their original byte pairs.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 select the low / high nibble of a byte (presumably 0x0F and
// 0xF0, supplied by the host).
kernel void kernel_restore_block_q4_K_noshuffle(
    global uchar * src_q,
    global uchar * src_s,
    global half * src_d,
    global half * src_dm,
    global struct block_q4_K * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_K * out = dst + blk;
    global uchar * quants = src_q + QK_K/2 * blk;
    global uchar * scales = src_s + K_SCALE_SIZE * blk;

    out->d  = src_d[blk];
    out->dm = src_dm[blk];

    for (int grp = 0; grp < QK_K / 64; ++grp) {
        for (int j = 0; j < 16; ++j) {
            const uchar lows  = quants[grp*32 + j];
            const uchar highs = quants[grp*32 + j + 16];
            out->q[grp*32 + 2*j]     = convert_uchar((lows & mask_0F) | ((highs & mask_0F) << 4));
            out->q[grp*32 + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (highs & mask_F0));
        }
    }

    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        out->s[i] = scales[i];
    }
}
1742
+
//------------------------------------------------------------------------------
// kernel_convert_block_q5_K
// Split block_q5_K (AOS) into five flat arrays (SOA): low quants, high bits,
// scales, d and dm. All bytes are copied as-is.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 are not used by this kernel.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q5_K(
    global struct block_q5_K * src0,
    global uchar * dst_q,
    global uchar * dst_qh,
    global uchar * dst_s,
    global half * dst_d,
    global half * dst_dm,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_K * in = src0 + blk;
    global uchar * quants = dst_q + QK_K/2*blk;
    global uchar * highs  = dst_qh + QK_K/8*blk;
    global uchar * scales = dst_s + K_SCALE_SIZE*blk;

    dst_d[blk]  = in->d;
    dst_dm[blk] = in->dm;

    for (int i = 0; i < QK_K/2; ++i) {
        quants[i] = in->qs[i];
    }
    for (int i = 0; i < QK_K/8; ++i) {
        highs[i] = in->qh[i];
    }
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        scales[i] = in->s[i];
    }
}
1778
+
// Rebuild block_q5_K (AOS) from the five flat arrays: low quants, high bits,
// scales, d and dm. All bytes are copied as-is.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 are not used by this kernel.
kernel void kernel_restore_block_q5_K(
    global uchar * src_q,
    global uchar * src_qh,
    global uchar * src_s,
    global half * src_d,
    global half * src_dm,
    global struct block_q5_K * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_K * out = dst + blk;
    global uchar * quants = src_q + QK_K/2*blk;
    global uchar * highs  = src_qh + QK_K/8*blk;
    global uchar * scales = src_s + K_SCALE_SIZE*blk;

    out->d  = src_d[blk];
    out->dm = src_dm[blk];

    for (int i = 0; i < QK_K/2; ++i) {
        out->qs[i] = quants[i];
    }
    for (int i = 0; i < QK_K/8; ++i) {
        out->qh[i] = highs[i];
    }
    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        out->s[i] = scales[i];
    }
}
1811
+
// Split block_q5_K (AOS) into five flat arrays (SOA) and rearrange the bits:
//  - low quants: within every 32-byte group, the low nibbles of each byte pair
//    go to the first 16 bytes and the high nibbles to the last 16 bytes;
//  - high bits: output byte l collects bit (l / 4) of the eight source bytes
//    qh[(l % 4) * 8 + 0..7].
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 select the low / high nibble of a byte (presumably 0x0F and
// 0xF0, supplied by the host).
kernel void kernel_convert_block_q5_K_noshuffle(
    global struct block_q5_K * src0,
    global uchar * dst_q,
    global uchar * dst_qh,
    global uchar * dst_s,
    global half * dst_d,
    global half * dst_dm,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_K * in = src0 + blk;
    global uchar * quants = dst_q + QK_K/2 * blk;
    global uchar * highs  = dst_qh + QK_K/8 * blk;
    global uchar * scales = dst_s + K_SCALE_SIZE * blk;

    dst_d[blk]  = in->d;
    dst_dm[blk] = in->dm;

    for (int grp = 0; grp < QK_K / 64; ++grp) {
        for (int j = 0; j < 16; ++j) {
            const uchar even = in->qs[grp*32 + 2*j];
            const uchar odd  = in->qs[grp*32 + 2*j + 1];
            quants[grp*32 + j]      = convert_uchar(even & mask_0F) | convert_uchar((odd & mask_0F) << 4);
            quants[grp*32 + j + 16] = convert_uchar((even & mask_F0) >> 4) | convert_uchar(odd & mask_F0);
        }
    }

    for (int l = 0; l < QK_K/8; ++l) {
        const int src_group = l % 4; // which run of 8 source bytes
        const int src_bit   = l / 4; // which bit of those bytes
        uchar gathered = 0;
        for (int i = 0; i < 8; ++i) {
            gathered |= ((in->qh[src_group*8 + i] >> src_bit) & 0x01) << i;
        }
        highs[l] = gathered;
    }

    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        scales[i] = in->s[i];
    }
}
1853
+
// Rebuild block_q5_K (AOS) from the flat arrays written by
// kernel_convert_block_q5_K_noshuffle:
//  - low quants: nibbles of every 32-byte group are re-interleaved into their
//    original byte pairs;
//  - high bits: destination byte qh[g * 8 + i] collects bit i of the eight
//    source bytes qh[4 * k + g], k = 0..7.
// Each work-item handles one super block, indexed by get_global_id(0).
// mask_0F / mask_F0 select the low / high nibble of a byte (presumably 0x0F and
// 0xF0, supplied by the host).
kernel void kernel_restore_block_q5_K_noshuffle(
    global uchar * src_q,
    global uchar * src_qh,
    global uchar * src_s,
    global half * src_d,
    global half * src_dm,
    global struct block_q5_K * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q5_K * out = dst + blk;
    global uchar * quants = src_q + QK_K/2 * blk;
    global uchar * highs  = src_qh + QK_K/8 * blk;
    global uchar * scales = src_s + K_SCALE_SIZE * blk;

    out->d  = src_d[blk];
    out->dm = src_dm[blk];

    for (int grp = 0; grp < QK_K / 64; ++grp) {
        for (int j = 0; j < 16; ++j) {
            const uchar lows   = quants[grp*32 + j];
            const uchar uppers = quants[grp*32 + j + 16];
            out->qs[grp*32 + 2*j]     = convert_uchar((lows & mask_0F) | ((uppers & mask_0F) << 4));
            out->qs[grp*32 + 2*j + 1] = convert_uchar(((lows & mask_F0) >> 4) | (uppers & mask_F0));
        }
    }

    for (int dst_byte = 0; dst_byte < 32; ++dst_byte) {
        const int g = dst_byte / 8; // source column
        const int i = dst_byte % 8; // source bit
        uchar gathered = 0;
        for (int k = 0; k < 8; ++k) {
            gathered |= ((highs[4*k + g] >> i) & 0x01) << k;
        }
        out->qh[dst_byte] = gathered;
    }

    for (int i = 0; i < K_SCALE_SIZE; ++i) {
        out->s[i] = scales[i];
    }
}
1897
+
1898
+ //------------------------------------------------------------------------------
1899
+ // kernel_convert_block_q6_K
1900
+ // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
1901
+ // This kernel does not deshuffle the bits.
1902
+ // Each thread processes a super block.
1903
+ //------------------------------------------------------------------------------
1904
+ kernel void kernel_convert_block_q6_K(
1905
+ global struct block_q6_K * src0,
1906
+ global uchar * dst_ql,
1907
+ global uchar * dst_qh,
1908
+ global char * dst_s,
1909
+ global half * dst_d,
1910
+ uchar mask_lsb_8,
1911
+ ulong n_blk
1912
+ ) {
1913
+ if (get_global_id(0) >= n_blk) {
1914
+ return;
1915
+ }
424
1916
  global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
425
1917
  global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
426
1918
  global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
@@ -447,8 +1939,13 @@ kernel void kernel_restore_block_q6_K(
447
1939
  global uchar * dst_qh,
448
1940
  global char * dst_s,
449
1941
  global half * dst_d,
450
- global struct block_q6_K * dst
1942
+ global struct block_q6_K * dst,
1943
+ uchar mask_lsb_8,
1944
+ ulong n_blk
451
1945
  ) {
1946
+ if (get_global_id(0) >= n_blk) {
1947
+ return;
1948
+ }
452
1949
  global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
453
1950
  global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
454
1951
  global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
@@ -467,3 +1964,213 @@ kernel void kernel_restore_block_q6_K(
467
1964
  b->scales[i] = s[i];
468
1965
  }
469
1966
  }
1967
+
1968
// kernel_convert_block_q6_K_noshuffle
// Splits one block_q6_K per work-item (AOS -> SOA) into the dst_ql, dst_qh,
// dst_s and dst_d arrays, regrouping the ql nibbles and the qh 2-bit fields
// while copying. Work-items whose global id is >= n_blk do nothing.
//   src0       - input blocks, indexed by get_global_id(0)
//   dst_ql     - output, QK_K/2 bytes per block
//   dst_qh     - output, QK_K/4 bytes per block
//   dst_s      - output, QK_K/16 scale bytes per block
//   dst_d      - output, one half per block
//   mask_lsb_8 - ANDed with every ql/qh byte read from the block
//                (presumably 0xFF - confirm against the host code)
//   n_blk      - number of blocks to process
+ kernel void kernel_convert_block_q6_K_noshuffle(
1969
+ global struct block_q6_K * src0,
1970
+ global uchar * dst_ql,
1971
+ global uchar * dst_qh,
1972
+ global char * dst_s,
1973
+ global half * dst_d,
1974
+ uchar mask_lsb_8,
1975
+ ulong n_blk
1976
+ ) {
1977
+ if (get_global_id(0) >= n_blk) {
1978
+ return;
1979
+ }
1980
+ global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0);
1981
+ global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0);
1982
+ global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0);
1983
+ global char * s = (global char *) dst_s + QK_K/16*get_global_id(0);
1984
+ global half * d = (global half *) dst_d + get_global_id(0);
1985
+
1986
+ *d = b->d;
1987
+
1988
// ql: for each pair of adjacent source bytes, the two low nibbles are packed
// into one output byte and the two high nibbles into the byte 32 positions later.
// NOTE(review): the literal offsets 32/64/96 line up with the QK_K/2/4 loop
// bound only if QK_K is 256 - confirm.
+ for (int i = 0; i < QK_K/2/4; ++i) {
1989
+ uchar x0 = b->ql[i*2 + 0] & mask_lsb_8;
1990
+ uchar x1 = b->ql[i*2 + 1] & mask_lsb_8;
1991
+ ql[i + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4);
1992
+ ql[i + 32] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0);
1993
+
1994
// Same packing for the source bytes starting at offset 64.
+ uchar x2 = b->ql[i*2 + 0 + 64] & mask_lsb_8;
1995
+ uchar x3 = b->ql[i*2 + 1 + 64] & mask_lsb_8;
1996
+ ql[i + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
1997
+ ql[i + 96] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
1998
+ }
1999
+
2000
// qh: for each group of four source bytes, the 2-bit fields that sit at the
// same bit position are gathered into one output byte; the four resulting
// bytes are stored 8 positions apart.
+ for (int i = 0; i < QK_K/4/8; ++i) {
2001
+ uchar x0 = b->qh[i*4 + 0] & mask_lsb_8;
2002
+ uchar x1 = b->qh[i*4 + 1] & mask_lsb_8;
2003
+ uchar x2 = b->qh[i*4 + 2] & mask_lsb_8;
2004
+ uchar x3 = b->qh[i*4 + 3] & mask_lsb_8;
2005
+ qh[i + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6);
2006
+ qh[i + 8] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4);
2007
+ qh[i + 16] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2);
2008
+ qh[i + 24] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0);
2009
+
2010
// Same gathering for the source bytes starting at offset 32.
+ uchar x4 = b->qh[i*4 + 0 + 32] & mask_lsb_8;
2011
+ uchar x5 = b->qh[i*4 + 1 + 32] & mask_lsb_8;
2012
+ uchar x6 = b->qh[i*4 + 2 + 32] & mask_lsb_8;
2013
+ uchar x7 = b->qh[i*4 + 3 + 32] & mask_lsb_8;
2014
+ qh[i + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
2015
+ qh[i + 40] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
2016
+ qh[i + 48] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
2017
+ qh[i + 56] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
2018
+ }
2019
+
2020
// The scales are copied unchanged.
+ for (int i = 0; i < QK_K/16; ++i) {
2021
+ s[i] = b->scales[i];
2022
+ }
2023
+ }
2024
+
2025
// kernel_restore_block_q6_K_noshuffle
// Rebuilds one block_q6_K per work-item (SOA -> AOS) from the separate
// src_ql, src_qh, src_s and src_d arrays. It applies the same nibble / 2-bit
// regrouping as kernel_convert_block_q6_K_noshuffle with source and
// destination swapped, which undoes that kernel's packing when mask_lsb_8
// leaves the bytes unchanged. Work-items whose global id is >= n_blk do nothing.
//   src_ql     - input, QK_K/2 bytes per block
//   src_qh     - input, QK_K/4 bytes per block
//   src_s      - input, QK_K/16 scale bytes per block
//   src_d      - input, one half per block
//   dst        - output blocks, indexed by get_global_id(0)
//   mask_lsb_8 - ANDed with every ql/qh byte read from the source arrays
//                (presumably 0xFF - confirm against the host code)
//   n_blk      - number of blocks to process
+ kernel void kernel_restore_block_q6_K_noshuffle(
2026
+ global uchar * src_ql,
2027
+ global uchar * src_qh,
2028
+ global char * src_s,
2029
+ global half * src_d,
2030
+ global struct block_q6_K * dst,
2031
+ uchar mask_lsb_8,
2032
+ ulong n_blk
2033
+ ) {
2034
+ if (get_global_id(0) >= n_blk) {
2035
+ return;
2036
+ }
2037
+ global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0);
2038
+ global uchar * ql = (global uchar *) src_ql + QK_K/2*get_global_id(0);
2039
+ global uchar * qh = (global uchar *) src_qh + QK_K/4*get_global_id(0);
2040
+ global char * s = (global char *) src_s + QK_K/16*get_global_id(0);
2041
+ global half * d = (global half *) src_d + get_global_id(0);
2042
+
2043
+ b->d = *d;
2044
+
2045
// ql: two packed bytes stored 32 apart are recombined into two adjacent
// block bytes - the low nibbles of both form the first byte, the high
// nibbles form the second.
// NOTE(review): the literal offsets 32/64/96 line up with the QK_K/2/4 loop
// bound only if QK_K is 256 - confirm.
+ for (int i = 0; i < QK_K/2/4; ++i) {
2046
+ uchar x0 = ql[i + 0] & mask_lsb_8;
2047
+ uchar x1 = ql[i + 32] & mask_lsb_8;
2048
+ b->ql[i*2 + 0] = (x0 & 0x0F) | ((x1 & 0x0F) << 4);
2049
+ b->ql[i*2 + 1] = ((x0 & 0xF0) >> 4) | (x1 & 0xF0);
2050
+
2051
// Same recombination for the block bytes starting at offset 64.
+ uchar x2 = ql[i + 64] & mask_lsb_8;
2052
+ uchar x3 = ql[i + 96] & mask_lsb_8;
2053
+ b->ql[i*2 + 0 + 64] = (x2 & 0x0F) | ((x3 & 0x0F) << 4);
2054
+ b->ql[i*2 + 1 + 64] = ((x2 & 0xF0) >> 4) | (x3 & 0xF0);
2055
+ }
2056
+
2057
// qh: four packed bytes stored 8 apart are redistributed into four adjacent
// block bytes; block byte j collects the j-th 2-bit field of each packed byte.
+ for (int i = 0; i < QK_K/4/8; ++i) {
2058
+ uchar x0 = qh[i + 0] & mask_lsb_8;
2059
+ uchar x1 = qh[i + 8] & mask_lsb_8;
2060
+ uchar x2 = qh[i + 16] & mask_lsb_8;
2061
+ uchar x3 = qh[i + 24] & mask_lsb_8;
2062
+ b->qh[i*4 + 0] = (x0 & 0x03) | ((x1 & 0x03) << 2) | ((x2 & 0x03) << 4) | ((x3 & 0x03) << 6);
2063
+ b->qh[i*4 + 1] = ((x0 & 0x0C) >> 2) | (x1 & 0x0C) | ((x2 & 0x0C) << 2) | ((x3 & 0x0C) << 4);
2064
+ b->qh[i*4 + 2] = ((x0 & 0x30) >> 4) | ((x1 & 0x30) >> 2) | (x2 & 0x30) | ((x3 & 0x30) << 2);
2065
+ b->qh[i*4 + 3] = ((x0 & 0xC0) >> 6) | ((x1 & 0xC0) >> 4) | ((x2 & 0xC0) >> 2) | (x3 & 0xC0);
2066
+
2067
// Same redistribution for the block bytes starting at offset 32.
+ uchar x4 = qh[i + 0 + 32] & mask_lsb_8;
2068
+ uchar x5 = qh[i + 8 + 32] & mask_lsb_8;
2069
+ uchar x6 = qh[i + 16 + 32] & mask_lsb_8;
2070
+ uchar x7 = qh[i + 24 + 32] & mask_lsb_8;
2071
+ b->qh[i*4 + 0 + 32] = (x4 & 0x03) | ((x5 & 0x03) << 2) | ((x6 & 0x03) << 4) | ((x7 & 0x03) << 6);
2072
+ b->qh[i*4 + 1 + 32] = ((x4 & 0x0C) >> 2) | (x5 & 0x0C) | ((x6 & 0x0C) << 2) | ((x7 & 0x0C) << 4);
2073
+ b->qh[i*4 + 2 + 32] = ((x4 & 0x30) >> 4) | ((x5 & 0x30) >> 2) | (x6 & 0x30) | ((x7 & 0x30) << 2);
2074
+ b->qh[i*4 + 3 + 32] = ((x4 & 0xC0) >> 6) | ((x5 & 0xC0) >> 4) | ((x6 & 0xC0) >> 2) | (x7 & 0xC0);
2075
+ }
2076
+
2077
// The scales are copied unchanged.
+ for (int i = 0; i < QK_K/16; ++i) {
2078
+ b->scales[i] = s[i];
2079
+ }
2080
+ }
2081
+
2082
+ //------------------------------------------------------------------------------
2083
+ // kernel_convert_block_iq4_nl
2084
+ // Convert the block_iq4_nl format to 2 separate arrays (AOS -> SOA).
2085
+ //------------------------------------------------------------------------------
2086
// Each work-item with global id < n_blk copies one block_iq4_nl from src0:
// the d field goes to dst_d and the QK4_NL/2 qs bytes go to dst_q, byte for
// byte and in the same order.
//   src0    - input blocks, indexed by get_global_id(0)
//   dst_q   - output, QK4_NL/2 bytes per block
//   dst_d   - output, one half per block
//   mask_0F - not referenced in this kernel body
//   mask_F0 - not referenced in this kernel body
//   n_blk   - number of blocks to process
+ kernel void kernel_convert_block_iq4_nl(
2087
+ global struct block_iq4_nl * src0,
2088
+ global uchar * dst_q,
2089
+ global half * dst_d,
2090
+ uchar mask_0F,
2091
+ uchar mask_F0,
2092
+ ulong n_blk
2093
+ ) {
2094
+ if (get_global_id(0) >= n_blk) {
2095
+ return;
2096
+ }
2097
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
2098
+ global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
2099
+ global half * d = (global half *) dst_d + get_global_id(0);
2100
+
2101
+ *d = b->d;
2102
+
2103
// Plain copy of the quantized bytes; no bit regrouping happens here.
+ for (int i = 0; i < QK4_NL/2; ++i) {
2104
+ q[i] = b->qs[i];
2105
+ }
2106
+ }
2107
+
2108
// kernel_restore_block_iq4_nl
// Rebuilds one block_iq4_nl per work-item (SOA -> AOS): d is read from src_d
// and the QK4_NL/2 qs bytes are read from src_q, byte for byte and in the
// same order. Work-items whose global id is >= n_blk do nothing.
//   src_q - input, QK4_NL/2 bytes per block
//   src_d - input, one half per block
//   dst   - output blocks, indexed by get_global_id(0)
//   n_blk - number of blocks to process
+ kernel void kernel_restore_block_iq4_nl(
2109
+ global uchar * src_q,
2110
+ global half * src_d,
2111
+ global struct block_iq4_nl * dst,
2112
+ ulong n_blk
2113
+ ) {
2114
+ if (get_global_id(0) >= n_blk) {
2115
+ return;
2116
+ }
2117
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
2118
+ global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
2119
+ global half * d = (global half *) src_d + get_global_id(0);
2120
+
2121
+ b->d = *d;
2122
+
2123
// Plain copy of the quantized bytes; no bit regrouping happens here.
+ for (int i = 0; i < QK4_NL/2; ++i) {
2124
+ b->qs[i] = q[i];
2125
+ }
2126
+ }
2127
+
2128
// kernel_convert_block_iq4_nl_noshuffle
// Splits one block_iq4_nl per work-item (AOS -> SOA) into dst_q and dst_d,
// regrouping the qs nibbles while copying. Work-items whose global id is
// >= n_blk do nothing.
//   src0    - input blocks, indexed by get_global_id(0)
//   dst_q   - output, QK4_NL/2 bytes per block
//   dst_d   - output, one half per block
//   mask_0F - mask used to select the low nibble (presumably 0x0F - confirm against the host code)
//   mask_F0 - mask used to select the high nibble (presumably 0xF0 - confirm against the host code)
//   n_blk   - number of blocks to process
+ kernel void kernel_convert_block_iq4_nl_noshuffle(
2129
+ global struct block_iq4_nl * src0,
2130
+ global uchar * dst_q,
2131
+ global half * dst_d,
2132
+ uchar mask_0F,
2133
+ uchar mask_F0,
2134
+ ulong n_blk
2135
+ ) {
2136
+ if (get_global_id(0) >= n_blk) {
2137
+ return;
2138
+ }
2139
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) src0 + get_global_id(0);
2140
+ global uchar * q = (global uchar *) dst_q + QK4_NL/2*get_global_id(0);
2141
+ global half * d = (global half *) dst_d + get_global_id(0);
2142
+
2143
+ *d = b->d;
2144
// For each pair of adjacent source bytes, the mask_0F-selected bits of both
// are packed into q[i] and the mask_F0-selected bits of both into
// q[i + QK4_NL/4].
+ for (int i = 0; i < QK4_NL/4; ++i) {
2145
+ uchar x0 = b->qs[2*i + 0];
2146
+ uchar x1 = b->qs[2*i + 1];
2147
+
2148
+ q[i + 0 ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
2149
+ q[i + QK4_NL/4] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
2150
+ }
2151
+ }
2152
+
2153
// kernel_restore_block_iq4_nl_noshuffle
// Rebuilds one block_iq4_nl per work-item (SOA -> AOS) from src_q and src_d.
// It applies the same nibble regrouping as
// kernel_convert_block_iq4_nl_noshuffle with source and destination swapped,
// which undoes that kernel's packing when the masks are 0x0F and 0xF0.
// Work-items whose global id is >= n_blk do nothing.
//   src_q   - input, QK4_NL/2 bytes per block
//   src_d   - input, one half per block
//   dst     - output blocks, indexed by get_global_id(0)
//   mask_0F - mask used to select the low nibble (presumably 0x0F - confirm against the host code)
//   mask_F0 - mask used to select the high nibble (presumably 0xF0 - confirm against the host code)
//   n_blk   - number of blocks to process
+ kernel void kernel_restore_block_iq4_nl_noshuffle(
2154
+ global uchar * src_q,
2155
+ global half * src_d,
2156
+ global struct block_iq4_nl * dst,
2157
+ uchar mask_0F,
2158
+ uchar mask_F0,
2159
+ ulong n_blk
2160
+ ) {
2161
+ if (get_global_id(0) >= n_blk) {
2162
+ return;
2163
+ }
2164
+ global struct block_iq4_nl * b = (global struct block_iq4_nl *) dst + get_global_id(0);
2165
+ global uchar * q = (global uchar *) src_q + QK4_NL/2*get_global_id(0);
2166
+ global half * d = (global half *) src_d + get_global_id(0);
2167
+
2168
+ b->d = *d;
2169
// Two packed bytes stored QK4_NL/4 apart are recombined into two adjacent
// block bytes: the mask_0F-selected bits of both form qs[2*i], the
// mask_F0-selected bits of both form qs[2*i + 1].
+ for (int i = 0; i < QK4_NL/4; ++i) {
2170
+ uchar x0 = q[i + 0 ];
2171
+ uchar x1 = q[i + QK4_NL/4];
2172
+
2173
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
2174
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
2175
+ }
2176
+ }