llama_cpp 0.15.4 → 0.16.0

Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
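
Before the expanded diff below (the new CUDA mul_mat_q source, item 26 in the list above), here is a minimal, hedged sketch of picking up this release in a Ruby project. The gem name and version are taken from this page; the LLaMACpp::VERSION constant is an assumption based on data/lib/llama_cpp/version.rb and is shown only for illustration.

    # Gemfile entry to pin the release documented on this page:
    #   gem 'llama_cpp', '0.16.0'
    #
    # After `bundle update llama_cpp`, a quick check that the new version loaded:
    require 'llama_cpp'
    puts LLaMACpp::VERSION # constant name assumed from data/lib/llama_cpp/version.rb; expected "0.16.0"
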
data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu
@@ -0,0 +1,1564 @@
1
+ #include "mmq.cuh"
2
+ #include "vecdotq.cuh"
3
+
4
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
5
+ typedef void (*load_tiles_cuda_t)(
6
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
7
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
8
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
9
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
10
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
11
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
12
+ typedef void (mul_mat_q_t)(
13
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
14
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
15
+
16
+ struct mmq_arch_config_t {
17
+ int x;
18
+ int y;
19
+ int nwarps;
20
+ };
21
+
22
+ struct mmq_config_t {
23
+ mmq_arch_config_t rdna2;
24
+ mmq_arch_config_t rdna1;
25
+ mmq_arch_config_t ampere;
26
+ mmq_arch_config_t pascal;
27
+ };
28
+
29
+ constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
30
+ // x y nwarps
31
+ { 64, 128, 8},
32
+ { 64, 64, 8},
33
+ #ifdef CUDA_USE_TENSOR_CORES
34
+ { 4, 32, 4},
35
+ #else
36
+ { 64, 128, 4},
37
+ #endif // CUDA_USE_TENSOR_CORES
38
+ { 64, 64, 8},
39
+ };
40
+ constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
41
+ // x y nwarps
42
+ { 64, 128, 8},
43
+ { 64, 64, 8},
44
+ #ifdef CUDA_USE_TENSOR_CORES
45
+ { 4, 32, 4},
46
+ #else
47
+ { 64, 128, 4},
48
+ #endif // CUDA_USE_TENSOR_CORES
49
+ { 64, 64, 8},
50
+ };
51
+ constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
52
+ // x y nwarps
53
+ { 64, 128, 8},
54
+ { 64, 64, 8},
55
+ #ifdef CUDA_USE_TENSOR_CORES
56
+ { 4, 32, 4},
57
+ #else
58
+ {128, 64, 4},
59
+ #endif // CUDA_USE_TENSOR_CORES
60
+ { 64, 64, 8},
61
+ };
62
+ constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
63
+ // x y nwarps
64
+ { 64, 128, 8},
65
+ { 64, 64, 8},
66
+ #ifdef CUDA_USE_TENSOR_CORES
67
+ { 4, 32, 4},
68
+ #else
69
+ {128, 64, 4},
70
+ #endif // CUDA_USE_TENSOR_CORES
71
+ { 64, 64, 8},
72
+ };
73
+ constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
74
+ // x y nwarps
75
+ { 64, 128, 8},
76
+ { 64, 64, 8},
77
+ #ifdef CUDA_USE_TENSOR_CORES
78
+ { 4, 32, 4},
79
+ #else
80
+ {128, 64, 4},
81
+ #endif // CUDA_USE_TENSOR_CORES
82
+ { 64, 64, 8},
83
+ };
84
+ constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
85
+ // x y nwarps
86
+ { 64, 128, 8},
87
+ {128, 32, 8},
88
+ #ifdef CUDA_USE_TENSOR_CORES
89
+ { 4, 32, 4},
90
+ #else
91
+ { 64, 128, 4},
92
+ #endif // CUDA_USE_TENSOR_CORES
93
+ { 64, 64, 8},
94
+ };
95
+ constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
96
+ // x y nwarps
97
+ {128, 64, 8},
98
+ { 32, 128, 8},
99
+ #ifdef CUDA_USE_TENSOR_CORES
100
+ { 4, 32, 4},
101
+ #else
102
+ {128, 128, 4},
103
+ #endif // CUDA_USE_TENSOR_CORES
104
+ { 64, 64, 8},
105
+ };
106
+ constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
107
+ // x y nwarps
108
+ { 64, 128, 8},
109
+ { 32, 64, 8},
110
+ #ifdef CUDA_USE_TENSOR_CORES
111
+ { 4, 32, 4},
112
+ #else
113
+ { 64, 128, 4},
114
+ #endif // CUDA_USE_TENSOR_CORES
115
+ { 64, 64, 8},
116
+ };
117
+ constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
118
+ // x y nwarps
119
+ { 64, 128, 8},
120
+ { 32, 64, 8},
121
+ #ifdef CUDA_USE_TENSOR_CORES
122
+ { 4, 32, 4},
123
+ #else
124
+ { 64, 128, 4},
125
+ #endif // CUDA_USE_TENSOR_CORES
126
+ { 64, 64, 8},
127
+ };
128
+ constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
129
+ // x y nwarps
130
+ { 64, 128, 8},
131
+ { 32, 64, 8},
132
+ #ifdef CUDA_USE_TENSOR_CORES
133
+ { 4, 32, 4},
134
+ #else
135
+ { 64, 64, 4},
136
+ #endif // CUDA_USE_TENSOR_CORES
137
+ { 64, 64, 8},
138
+ };
139
+
140
+ // ------------------------------------------------------------
141
+
142
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
143
+ GGML_UNUSED(x_qh);
144
+ GGML_UNUSED(x_sc);
145
+
146
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
147
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
148
+
149
+ *x_ql = tile_x_qs;
150
+ *x_dm = (half2 *) tile_x_d;
151
+ }
152
+
153
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
154
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
155
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
156
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
157
+ GGML_CUDA_ASSUME(i_offset >= 0);
158
+ GGML_CUDA_ASSUME(i_offset < nwarps);
159
+ GGML_CUDA_ASSUME(k >= 0);
160
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
161
+
162
+ const int kbx = k / QI4_0;
163
+ const int kqsx = k % QI4_0;
164
+
165
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;
166
+
167
+ float * x_dmf = (float *) x_dm;
168
+
169
+ #pragma unroll
170
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
171
+ int i = i0 + i_offset;
172
+
173
+ if (need_check) {
174
+ i = min(i, i_max);
175
+ }
176
+
177
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
178
+
179
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
180
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
181
+ }
182
+
183
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
184
+ const int kbxd = k % blocks_per_tile_x_row;
185
+
186
+ #pragma unroll
187
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
188
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
189
+
190
+ if (need_check) {
191
+ i = min(i, i_max);
192
+ }
193
+
194
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
195
+
196
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
197
+ }
198
+ }
199
+
200
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
201
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
202
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
203
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
204
+
205
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
206
+ const float * x_dmf = (const float *) x_dm;
207
+
208
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
209
+
210
+ #pragma unroll
211
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
212
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
213
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
214
+ }
215
+
216
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
217
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
218
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
219
+ }
220
+
221
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
222
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
223
+
224
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
225
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
226
+
227
+ *x_ql = tile_x_qs;
228
+ *x_dm = tile_x_dm;
229
+ }
230
+
231
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
232
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
233
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
234
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
235
+
236
+ GGML_CUDA_ASSUME(i_offset >= 0);
237
+ GGML_CUDA_ASSUME(i_offset < nwarps);
238
+ GGML_CUDA_ASSUME(k >= 0);
239
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
240
+
241
+ const int kbx = k / QI4_1;
242
+ const int kqsx = k % QI4_1;
243
+
244
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;
245
+
246
+ #pragma unroll
247
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
248
+ int i = i0 + i_offset;
249
+
250
+ if (need_check) {
251
+ i = min(i, i_max);
252
+ }
253
+
254
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
255
+
256
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
257
+ }
258
+
259
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
260
+ const int kbxd = k % blocks_per_tile_x_row;
261
+
262
+ #pragma unroll
263
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
264
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
265
+
266
+ if (need_check) {
267
+ i = min(i, i_max);
268
+ }
269
+
270
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
271
+
272
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
273
+ }
274
+ }
275
+
276
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
277
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
278
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
279
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
280
+
281
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
282
+
283
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
284
+
285
+ #pragma unroll
286
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
287
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
288
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
289
+ }
290
+
291
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
292
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
293
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
294
+ }
295
+
296
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
297
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
298
+
299
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
300
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
301
+
302
+ *x_ql = tile_x_ql;
303
+ *x_dm = (half2 *) tile_x_d;
304
+ }
305
+
306
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
307
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
308
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
309
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
310
+
311
+ GGML_CUDA_ASSUME(i_offset >= 0);
312
+ GGML_CUDA_ASSUME(i_offset < nwarps);
313
+ GGML_CUDA_ASSUME(k >= 0);
314
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
315
+
316
+ const int kbx = k / QI5_0;
317
+ const int kqsx = k % QI5_0;
318
+
319
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;
320
+
321
+ #pragma unroll
322
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
323
+ int i = i0 + i_offset;
324
+
325
+ if (need_check) {
326
+ i = min(i, i_max);
327
+ }
328
+
329
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
330
+
331
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
332
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
333
+
334
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
335
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
336
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
337
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
338
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
339
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
340
+
341
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
342
+
343
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
344
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
345
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
346
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
347
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
348
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
349
+
350
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
351
+ }
352
+
353
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
354
+ const int kbxd = k % blocks_per_tile_x_row;
355
+ float * x_dmf = (float *) x_dm;
356
+
357
+ #pragma unroll
358
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
359
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
360
+
361
+ if (need_check) {
362
+ i = min(i, i_max);
363
+ }
364
+
365
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
366
+
367
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
368
+ }
369
+ }
370
+
371
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
372
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
373
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
374
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
375
+
376
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
377
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
378
+ const float * x_dmf = (const float *) x_dm;
379
+ const float * y_df = (const float *) y_ds;
380
+
381
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
382
+
383
+ #pragma unroll
384
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
385
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
386
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
387
+ }
388
+
389
+ return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
390
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
391
+ }
392
+
393
+
394
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
395
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
396
+
397
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
398
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
399
+
400
+ *x_ql = tile_x_ql;
401
+ *x_dm = tile_x_dm;
402
+ }
403
+
404
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
405
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
406
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
407
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
408
+
409
+ GGML_CUDA_ASSUME(i_offset >= 0);
410
+ GGML_CUDA_ASSUME(i_offset < nwarps);
411
+ GGML_CUDA_ASSUME(k >= 0);
412
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
413
+
414
+ const int kbx = k / QI5_1;
415
+ const int kqsx = k % QI5_1;
416
+
417
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;
418
+
419
+ #pragma unroll
420
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
421
+ int i = i0 + i_offset;
422
+
423
+ if (need_check) {
424
+ i = min(i, i_max);
425
+ }
426
+
427
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
428
+
429
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
430
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
431
+
432
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
433
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
434
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
435
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
436
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
437
+
438
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
439
+
440
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
441
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
442
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
443
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
444
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
445
+
446
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
447
+ }
448
+
449
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
450
+ const int kbxd = k % blocks_per_tile_x_row;
451
+
452
+ #pragma unroll
453
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
454
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
455
+
456
+ if (need_check) {
457
+ i = min(i, i_max);
458
+ }
459
+
460
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
461
+
462
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
463
+ }
464
+ }
465
+
466
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
467
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
468
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
469
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
470
+
471
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
472
+ const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
473
+
474
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
475
+
476
+ #pragma unroll
477
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
478
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
479
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
480
+ }
481
+
482
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
483
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
484
+ }
485
+
486
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
487
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
488
+
489
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
490
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
491
+
492
+ *x_ql = tile_x_qs;
493
+ *x_dm = (half2 *) tile_x_d;
494
+ }
495
+
496
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
497
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
498
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
499
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
500
+
501
+ GGML_CUDA_ASSUME(i_offset >= 0);
502
+ GGML_CUDA_ASSUME(i_offset < nwarps);
503
+ GGML_CUDA_ASSUME(k >= 0);
504
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
505
+
506
+ const int kbx = k / QI8_0;
507
+ const int kqsx = k % QI8_0;
508
+ float * x_dmf = (float *) x_dm;
509
+
510
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;
511
+
512
+ #pragma unroll
513
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
514
+ int i = i0 + i_offset;
515
+
516
+ if (need_check) {
517
+ i = min(i, i_max);
518
+ }
519
+
520
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
521
+
522
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
523
+ }
524
+
525
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
526
+ const int kbxd = k % blocks_per_tile_x_row;
527
+
528
+ #pragma unroll
529
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
530
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
531
+
532
+ if (need_check) {
533
+ i = min(i, i_max);
534
+ }
535
+
536
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
537
+
538
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
539
+ }
540
+ }
541
+
542
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
543
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
544
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
545
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
546
+
547
+ const float * x_dmf = (const float *) x_dm;
548
+ const float * y_df = (const float *) y_ds;
549
+
550
+ return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
551
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
552
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
553
+ }
554
+
555
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
556
+ GGML_UNUSED(x_qh);
557
+
558
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
559
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
560
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
561
+
562
+ *x_ql = tile_x_ql;
563
+ *x_dm = tile_x_dm;
564
+ *x_sc = tile_x_sc;
565
+ }
566
+
567
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
568
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
569
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
570
+ GGML_UNUSED(x_qh);
571
+
572
+ GGML_CUDA_ASSUME(i_offset >= 0);
573
+ GGML_CUDA_ASSUME(i_offset < nwarps);
574
+ GGML_CUDA_ASSUME(k >= 0);
575
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
576
+
577
+ const int kbx = k / QI2_K;
578
+ const int kqsx = k % QI2_K;
579
+
580
+ const block_q2_K * bx0 = (const block_q2_K *) vx;
581
+
582
+ #pragma unroll
583
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
584
+ int i = i0 + i_offset;
585
+
586
+ if (need_check) {
587
+ i = min(i, i_max);
588
+ }
589
+
590
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
591
+
592
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
593
+ }
594
+
595
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
596
+ const int kbxd = k % blocks_per_tile_x_row;
597
+
598
+ #pragma unroll
599
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
600
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
601
+
602
+ if (need_check) {
603
+ i = min(i, i_max);
604
+ }
605
+
606
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
607
+
608
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
609
+ }
610
+
611
+ #pragma unroll
612
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
613
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
614
+
615
+ if (need_check) {
616
+ i = min(i, i_max);
617
+ }
618
+
619
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
620
+
621
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
622
+ }
623
+ }
624
+
625
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
626
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
627
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
628
+ GGML_UNUSED(x_qh);
629
+
630
+ const int kbx = k / QI2_K;
631
+ const int ky = (k % QI2_K) * QR2_K;
632
+ const float * y_df = (const float *) y_ds;
633
+
634
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
635
+
636
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
637
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
638
+
639
+ #pragma unroll
640
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
641
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
642
+ }
643
+
644
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
645
+
646
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
647
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
648
+ }
649
+
650
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
651
+
652
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
653
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
654
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
655
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
656
+
657
+ *x_ql = tile_x_ql;
658
+ *x_dm = tile_x_dm;
659
+ *x_qh = tile_x_qh;
660
+ *x_sc = tile_x_sc;
661
+ }
662
+
663
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
664
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
665
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
666
+
667
+ GGML_CUDA_ASSUME(i_offset >= 0);
668
+ GGML_CUDA_ASSUME(i_offset < nwarps);
669
+ GGML_CUDA_ASSUME(k >= 0);
670
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
671
+
672
+ const int kbx = k / QI3_K;
673
+ const int kqsx = k % QI3_K;
674
+
675
+ const block_q3_K * bx0 = (const block_q3_K *) vx;
676
+
677
+ #pragma unroll
678
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
679
+ int i = i0 + i_offset;
680
+
681
+ if (need_check) {
682
+ i = min(i, i_max);
683
+ }
684
+
685
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
686
+
687
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
688
+ }
689
+
690
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
691
+ const int kbxd = k % blocks_per_tile_x_row;
692
+ float * x_dmf = (float *) x_dm;
693
+
694
+ #pragma unroll
695
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
696
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
697
+
698
+ if (need_check) {
699
+ i = min(i, i_max);
700
+ }
701
+
702
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
703
+
704
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
705
+ }
706
+
707
+ #pragma unroll
708
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
709
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
710
+
711
+ if (need_check) {
712
+ i = min(i, i_max);
713
+ }
714
+
715
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
716
+
717
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
718
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
719
+ }
720
+
721
+ #pragma unroll
722
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
723
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
724
+
725
+ if (need_check) {
726
+ i = min(i, i_max);
727
+ }
728
+
729
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
730
+
731
+ const int ksc = k % (QI3_K/4);
732
+
733
+ const int ksc_low = ksc % (QI3_K/8);
734
+ const int shift_low = 4 * (ksc / (QI3_K/8));
735
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
736
+
737
+ const int ksc_high = QI3_K/8;
738
+ const int shift_high = 2 * ksc;
739
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
740
+
741
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
742
+
743
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
744
+ }
745
+ }
746
+
747
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
748
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
749
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
750
+
751
+ const int kbx = k / QI3_K;
752
+ const int ky = (k % QI3_K) * QR3_K;
753
+ const float * x_dmf = (const float *) x_dm;
754
+ const float * y_df = (const float *) y_ds;
755
+
756
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
757
+
758
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
759
+
760
+ #pragma unroll
761
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
762
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
763
+ const int shift = 2 * ((ky % 32) / 8);
764
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
765
+
766
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
767
+ const int vlh = (vh << 2) & 0x04040404;
768
+
769
+ v[l] = __vsubss4(vll, vlh);
770
+ }
771
+
772
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
773
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
774
+ }
775
+
776
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
777
+ GGML_UNUSED(x_qh);
778
+
779
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
780
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
781
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
782
+
783
+ *x_ql = tile_x_ql;
784
+ *x_dm = tile_x_dm;
785
+ *x_sc = tile_x_sc;
786
+ }
787
+
788
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
789
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
790
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
791
+ GGML_UNUSED(x_qh);
792
+
793
+ GGML_CUDA_ASSUME(i_offset >= 0);
794
+ GGML_CUDA_ASSUME(i_offset < nwarps);
795
+ GGML_CUDA_ASSUME(k >= 0);
796
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
797
+
798
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
799
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
800
+
801
+ const block_q4_K * bx0 = (const block_q4_K *) vx;
802
+
803
+ #pragma unroll
804
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
805
+ int i = i0 + i_offset;
806
+
807
+ if (need_check) {
808
+ i = min(i, i_max);
809
+ }
810
+
811
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
812
+
813
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
814
+ }
815
+
816
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
817
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
818
+
819
+ #pragma unroll
820
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
821
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
822
+
823
+ if (need_check) {
824
+ i = min(i, i_max);
825
+ }
826
+
827
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
828
+
829
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
830
+ }
831
+
832
+ #pragma unroll
833
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
834
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
835
+
836
+ if (need_check) {
837
+ i = min(i, i_max);
838
+ }
839
+
840
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
841
+
842
+ const int * scales = (const int *) bxi->scales;
843
+
844
+ const int ksc = k % (WARP_SIZE/8);
845
+
846
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
847
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
848
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
849
+
850
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
851
+ }
852
+ }
853
+
854
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
855
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
856
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
857
+ GGML_UNUSED(x_qh);
858
+
859
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
860
+
861
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
862
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
863
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
864
+ }
865
+
866
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
867
+ GGML_UNUSED(x_qh);
868
+
869
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
870
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
871
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
872
+
873
+ *x_ql = tile_x_ql;
874
+ *x_dm = tile_x_dm;
875
+ *x_sc = tile_x_sc;
876
+ }
877
+
878
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
879
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
880
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
881
+ GGML_UNUSED(x_qh);
882
+
883
+ GGML_CUDA_ASSUME(i_offset >= 0);
884
+ GGML_CUDA_ASSUME(i_offset < nwarps);
885
+ GGML_CUDA_ASSUME(k >= 0);
886
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
887
+
888
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
889
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
890
+
891
+ const block_q5_K * bx0 = (const block_q5_K *) vx;
892
+
893
+ #pragma unroll
894
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
895
+ int i = i0 + i_offset;
896
+
897
+ if (need_check) {
898
+ i = min(i, i_max);
899
+ }
900
+
901
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
902
+ const int ky = QR5_K*kqsx;
903
+
904
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
905
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
906
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
907
+
908
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
909
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
910
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
911
+
912
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
913
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
914
+
915
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
916
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
917
+ }
918
+
919
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
920
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
921
+
922
+ #pragma unroll
923
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
924
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
925
+
926
+ if (need_check) {
927
+ i = min(i, i_max);
928
+ }
929
+
930
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
931
+
932
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
933
+ }
934
+
935
+ #pragma unroll
936
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
937
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
938
+
939
+ if (need_check) {
940
+ i = min(i, i_max);
941
+ }
942
+
943
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
944
+
945
+ const int * scales = (const int *) bxi->scales;
946
+
947
+ const int ksc = k % (WARP_SIZE/8);
948
+
949
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
950
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
951
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
952
+
953
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
954
+ }
955
+ }
956
+
957
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
958
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
959
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
960
+ GGML_UNUSED(x_qh);
961
+
962
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
963
+
964
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
965
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
966
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
967
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
968
+ }
969
+
970
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
971
+ GGML_UNUSED(x_qh);
972
+
973
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
974
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
975
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
976
+
977
+ *x_ql = tile_x_ql;
978
+ *x_dm = tile_x_dm;
979
+ *x_sc = tile_x_sc;
980
+ }
981
+
982
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
983
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
984
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
985
+ GGML_UNUSED(x_qh);
986
+
987
+ GGML_CUDA_ASSUME(i_offset >= 0);
988
+ GGML_CUDA_ASSUME(i_offset < nwarps);
989
+ GGML_CUDA_ASSUME(k >= 0);
990
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
991
+
992
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
993
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
994
+
995
+ const block_q6_K * bx0 = (const block_q6_K *) vx;
996
+
997
+ #pragma unroll
998
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
999
+ int i = i0 + i_offset;
1000
+
1001
+ if (need_check) {
1002
+ i = min(i, i_max);
1003
+ }
1004
+
1005
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1006
+ const int ky = QR6_K*kqsx;
1007
+
1008
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
1009
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
1010
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
1011
+
1012
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
1013
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
1014
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
1015
+
1016
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
1017
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
1018
+
1019
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
1020
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
1021
+ }
1022
+
1023
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
1024
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1025
+ float * x_dmf = (float *) x_dm;
1026
+
1027
+ #pragma unroll
1028
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
1029
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
1030
+
1031
+ if (need_check) {
1032
+ i = min(i, i_max);
1033
+ }
1034
+
1035
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1036
+
1037
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
1038
+ }
1039
+
1040
+ #pragma unroll
1041
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
1042
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
1043
+
1044
+ if (need_check) {
1045
+ i = min(i, i_max);
1046
+ }
1047
+
1048
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
1049
+
1050
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
1051
+ }
1052
+ }
1053
+
1054
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
1055
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1056
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1057
+ GGML_UNUSED(x_qh);
1058
+
1059
+ const float * x_dmf = (const float *) x_dm;
1060
+ const float * y_df = (const float *) y_ds;
1061
+
1062
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
1063
+
1064
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
1065
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
1066
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
1067
+ }
1068
+
1069
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
1070
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
1071
+ static __device__ __forceinline__ void mul_mat_q(
1072
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1073
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+     const block_q_t  * x = (const block_q_t  *) vx;
+     const block_q8_1 * y = (const block_q8_1 *) vy;
+
+     const int blocks_per_row_x = ncols_x / qk;
+     const int blocks_per_col_y = nrows_y / QK8_1;
+     const int blocks_per_warp = WARP_SIZE / qi;
+
+     const int & ncols_dst = ncols_y;
+
+     const int row_dst_0 = blockIdx.x*mmq_y;
+     const int & row_x_0 = row_dst_0;
+
+     const int col_dst_0 = blockIdx.y*mmq_x;
+     const int & col_y_0 = col_dst_0;
+
+     int   * tile_x_ql = nullptr;
+     half2 * tile_x_dm = nullptr;
+     int   * tile_x_qh = nullptr;
+     int   * tile_x_sc = nullptr;
+
+     allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+     __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
+     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
+
+     float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                    threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
+
+ #pragma unroll
+         for (int ir = 0; ir < qr; ++ir) {
+             const int kqs = ir*WARP_SIZE + threadIdx.x;
+             const int kbxd = kqs / QI8_1;
+
+ #pragma unroll
+             for (int i = 0; i < mmq_x; i += nwarps) {
+                 const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+                 const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                 const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                 tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+             }
+
+ #pragma unroll
+             for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                 const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                 const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                 const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                 // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                 const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                 half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                 if (need_sum) {
+                     *dsi_dst = *dsi_src;
+                 } else {
+                     float * dfi_dst = (float *) dsi_dst;
+                     *dfi_dst = __low2float(*dsi_src);
+                 }
+             }
+
+             __syncthreads();
+
+ // #pragma unroll // unrolling this loop causes too much register pressure
+             for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+ #pragma unroll
+                 for (int j = 0; j < mmq_x; j += nwarps) {
+ #pragma unroll
+                     for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                         sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                             tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                             threadIdx.x + i, threadIdx.y + j, k);
+                     }
+                 }
+             }
+
+             __syncthreads();
+         }
+     }
+
+ #pragma unroll
+     for (int j = 0; j < mmq_x; j += nwarps) {
+         const int col_dst = col_dst_0 + j + threadIdx.y;
+
+         if (col_dst >= ncols_dst) {
+             return;
+         }
+
+ #pragma unroll
+         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+             const int row_dst = row_dst_0 + threadIdx.x + i;
+
+             if (row_dst >= nrows_dst) {
+                 continue;
+             }
+
+             dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+         }
+     }
+ }
+
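The indexing above maps each thread block onto one mmq_y x mmq_x tile of dst: blockIdx.x advances over output rows in steps of mmq_y, blockIdx.y over output columns in steps of mmq_x, and edge tiles are handled by clamping loads and skipping out-of-range stores. A minimal host-side sketch of that geometry, using placeholder tile sizes rather than the per-architecture values configured later in this file:

    #include <cstdio>

    int main() {
        // placeholder tile shape; the real values come from the mmq_arch_config_t
        // selected per GPU architecture further down in this file
        const int mmq_x = 64; // dst columns covered per thread block
        const int mmq_y = 64; // dst rows covered per thread block

        const int nrows_dst = 4096; // example output rows
        const int ncols_dst = 512;  // example output columns

        // blockIdx.x tiles the rows, blockIdx.y tiles the columns (ceiling division)
        const int grid_x = (nrows_dst + mmq_y - 1) / mmq_y;
        const int grid_y = (ncols_dst + mmq_x - 1) / mmq_x;

        printf("grid of %d x %d blocks, each accumulating a %d x %d tile of dst\n",
               grid_x, grid_y, mmq_y, mmq_x);
        return 0;
    }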
+ static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
+
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+ #if defined(RDNA3) || defined(RDNA2)
+     return mmq_config.rdna2;
+ #else
+     return mmq_config.rdna1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ #else
+
+ #if __CUDA_ARCH__ >= CC_VOLTA
+     return mmq_config.ampere;
+ #else
+     return mmq_config.pascal;
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
+
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ }
+
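get_arch_config_device picks the tile/warp configuration at compile time on the device, mirroring the runtime selection done on the host in ggml_cuda_op_mul_mat_q below. The config types are defined earlier in the file; judging only from how their fields are used here, they look roughly like the following (an inferred sketch for orientation, not the verbatim definitions):

    // inferred layout of the config types (field names taken from their usage in this file)
    struct mmq_arch_config_t {
        int x;      // mmq_x: dst columns per thread-block tile
        int y;      // mmq_y: dst rows per thread-block tile
        int nwarps; // warps per thread block
    };

    struct mmq_config_t {
        mmq_arch_config_t rdna2;  // RDNA2/RDNA3 AMD GPUs
        mmq_arch_config_t rdna1;  // older AMD GPUs
        mmq_arch_config_t ampere; // NVIDIA Volta and newer
        mmq_arch_config_t pascal; // NVIDIA Pascal (DP4A-capable) GPUs
    };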
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q4_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
+
+     mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
+         load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q4_1(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
+
+     mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
+         load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
+
+     mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
+         load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_1(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
+
+     mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
+         load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q8_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
+
+     mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
+         load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q2_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
+
+     mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
+         load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q3_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
+
+     mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
+         load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q4_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
+
+     mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
+         load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
+
+     mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
+         load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q6_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
+
+     mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
+         load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define MMQ_SWITCH_CASE(type_suffix) \
+     case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
+         const bool need_check = false; \
+         mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+             (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+     } else { \
+         const bool need_check = true; \
+         mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+             (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+     } break; \
+
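For a concrete picture of the dispatch this macro produces, MMQ_SWITCH_CASE(4_0) expands to roughly the following case label (whitespace added):

    case GGML_TYPE_Q4_0: if (row_diff % arch_config.y == 0) {
        // the row count is a multiple of the tile height, so no bounds checks are needed
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } else {
        // partial last tile: instantiate the bounds-checked variant of the kernel
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } break;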
+ void ggml_cuda_op_mul_mat_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+     const int64_t ne00 = src0->ne[0];
+
+     const int64_t ne10 = src1->ne[0];
+     GGML_ASSERT(ne10 % QK8_1 == 0);
+
+     const int64_t ne0 = dst->ne[0];
+
+     const int64_t row_diff = row_high - row_low;
+
+     int id = ggml_cuda_get_device();
+     const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+     // the main device has a larger memory buffer to hold the results from all GPUs
+     // nrows_dst == nrows of the matrix that the kernel writes into
+     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+     mmq_config_t mmq_config;
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             mmq_config = MMQ_CONFIG_Q4_0;
+             break;
+         case GGML_TYPE_Q4_1:
+             mmq_config = MMQ_CONFIG_Q4_1;
+             break;
+         case GGML_TYPE_Q5_0:
+             mmq_config = MMQ_CONFIG_Q5_0;
+             break;
+         case GGML_TYPE_Q5_1:
+             mmq_config = MMQ_CONFIG_Q5_1;
+             break;
+         case GGML_TYPE_Q8_0:
+             mmq_config = MMQ_CONFIG_Q8_0;
+             break;
+         case GGML_TYPE_Q2_K:
+             mmq_config = MMQ_CONFIG_Q2_K;
+             break;
+         case GGML_TYPE_Q3_K:
+             mmq_config = MMQ_CONFIG_Q3_K;
+             break;
+         case GGML_TYPE_Q4_K:
+             mmq_config = MMQ_CONFIG_Q4_K;
+             break;
+         case GGML_TYPE_Q5_K:
+             mmq_config = MMQ_CONFIG_Q5_K;
+             break;
+         case GGML_TYPE_Q6_K:
+             mmq_config = MMQ_CONFIG_Q6_K;
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     mmq_arch_config_t arch_config;
+     if (compute_capability >= CC_RDNA2) {
+         arch_config = mmq_config.rdna2;
+     } else if (compute_capability >= CC_OFFSET_AMD) {
+         arch_config = mmq_config.rdna1;
+     } else if (compute_capability >= CC_VOLTA) {
+         arch_config = mmq_config.ampere;
+     } else if (compute_capability >= MIN_CC_DP4A) {
+         arch_config = mmq_config.pascal;
+     } else {
+         GGML_ASSERT(false);
+     }
+
+     const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
+     const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
+     const dim3 block_nums(block_num_x, block_num_y, 1);
+     const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
+
+     switch (src0->type) {
+         MMQ_SWITCH_CASE(4_0)
+         MMQ_SWITCH_CASE(4_1)
+         MMQ_SWITCH_CASE(5_0)
+         MMQ_SWITCH_CASE(5_1)
+         MMQ_SWITCH_CASE(8_0)
+         MMQ_SWITCH_CASE(2_K)
+         MMQ_SWITCH_CASE(3_K)
+         MMQ_SWITCH_CASE(4_K)
+         MMQ_SWITCH_CASE(5_K)
+         MMQ_SWITCH_CASE(6_K)
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddf_i);
+ }
+
+ bool ggml_cuda_supports_mmq(enum ggml_type type) {
+     switch (type) {
+         case GGML_TYPE_Q4_0:
+         case GGML_TYPE_Q4_1:
+         case GGML_TYPE_Q5_0:
+         case GGML_TYPE_Q5_1:
+         case GGML_TYPE_Q8_0:
+         case GGML_TYPE_Q2_K:
+         case GGML_TYPE_Q3_K:
+         case GGML_TYPE_Q4_K:
+         case GGML_TYPE_Q5_K:
+         case GGML_TYPE_Q6_K:
+             return true;
+         default:
+             return false;
+     }
+ }
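Callers can use this predicate to decide whether the MMQ path applies to a given weight type before quantizing src1 to q8_1 and dispatching ggml_cuda_op_mul_mat_q. A sketch of that kind of gating; the surrounding dispatch logic is illustrative only, not taken from the library:

    // illustrative gating only; the actual dispatch logic lives elsewhere in the CUDA backend
    if (ggml_cuda_supports_mmq(src0->type)) {
        // quantize the src1 columns to q8_1 blocks, then launch the MMQ kernels
        // via ggml_cuda_op_mul_mat_q(...)
    } else {
        // fall back to a different mul_mat path (e.g. dequantize + cuBLAS)
    }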