llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu (new file)
@@ -0,0 +1,1564 @@
+ #include "mmq.cuh"
+ #include "vecdotq.cuh"
+
+ typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
+ typedef void (*load_tiles_cuda_t)(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
+ typedef float (*vec_dot_q_mul_mat_cuda_t)(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
+ typedef void (mul_mat_q_t)(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
+
+ struct mmq_arch_config_t {
+ int x;
+ int y;
+ int nwarps;
+ };
+
+ struct mmq_config_t {
+ mmq_arch_config_t rdna2;
+ mmq_arch_config_t rdna1;
+ mmq_arch_config_t ampere;
+ mmq_arch_config_t pascal;
+ };
+
+ constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 64, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 64, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 64, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ {128, 64, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 64, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ {128, 64, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 64, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ {128, 64, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
+ // x y nwarps
+ { 64, 128, 8},
+ {128, 32, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
+ // x y nwarps
+ {128, 64, 8},
+ { 32, 128, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ {128, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 32, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 32, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 128, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+ constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
+ // x y nwarps
+ { 64, 128, 8},
+ { 32, 64, 8},
+ #ifdef CUDA_USE_TENSOR_CORES
+ { 4, 32, 4},
+ #else
+ { 64, 64, 4},
+ #endif // CUDA_USE_TENSOR_CORES
+ { 64, 64, 8},
+ };
+
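Each config above lists, per target architecture (RDNA2, RDNA1, Ampere-class, Pascal), the tile width `x` (columns of y handled per block), the tile height `y` (rows of x per block) and the number of warps per block. A minimal host-side sketch of how such a config could translate into a kernel launch, inferred from the `mul_mat_q` kernels later in this file; the helper name and the choice of the `ampere` entry are assumptions, not code from this release:

// Hypothetical launch helper; mirrors how the mmq_arch_config_t fields are consumed below.
static void launch_mul_mat_q4_0_sketch(const void * vx, const void * vy, float * dst,
        int ncols_x, int nrows_x, int ncols_y, int nrows_y, int nrows_dst, cudaStream_t stream) {
    const mmq_arch_config_t cfg = MMQ_CONFIG_Q4_0.ampere;            // assuming an Ampere-class GPU
    const dim3 block_nums((nrows_x + cfg.y - 1) / cfg.y,             // grid.x: tiles of cfg.y rows of x
                          (ncols_y + cfg.x - 1) / cfg.x, 1);         // grid.y: tiles of cfg.x columns of y
    const dim3 block_dims(WARP_SIZE, cfg.nwarps, 1);                 // cfg.nwarps warps per block
    if (nrows_x % cfg.y == 0) {
        mul_mat_q4_0<false><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        mul_mat_q4_0<true><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
}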
+ // ------------------------------------------------------------
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh);
+ GGML_UNUSED(x_sc);
+
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
+
+ *x_ql = tile_x_qs;
+ *x_dm = (half2 *) tile_x_d;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI4_0;
+ const int kqsx = k % QI4_0;
+
+ const block_q4_0 * bx0 = (const block_q4_0 *) vx;
+
+ float * x_dmf = (float *) x_dm;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+ const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+ const float * x_dmf = (const float *) x_dm;
+
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+ #pragma unroll
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+ }
+
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
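Throughout these loaders the quantized tile is addressed as `x_ql[i * (WARP_SIZE + 1) + k]`: each tile row is padded by one extra int. A small standalone sketch of that indexing; the bank-conflict rationale in the comment is the usual reason for such padding and is an assumption, since the diff itself does not state the motivation:

// Sketch of the padded shared-memory indexing used by the tile loaders above.
__device__ __forceinline__ int mmq_tile_index(int i, int k) {
    // row i of the tile, element k; the +1 int of padding per row staggers
    // consecutive rows across shared-memory banks (assumed motivation).
    return i * (WARP_SIZE + 1) + k;
}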
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
+
+ *x_ql = tile_x_qs;
+ *x_dm = tile_x_dm;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI4_1;
+ const int kqsx = k % QI4_1;
+
+ const block_q4_1 * bx0 = (const block_q4_1 *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+ const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+ int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+ #pragma unroll
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+ }
+
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
+
+ *x_ql = tile_x_ql;
+ *x_dm = (half2 *) tile_x_d;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI5_0;
+ const int kqsx = k % QI5_0;
+
+ const block_q5_0 * bx0 = (const block_q5_0 *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+ const int kbxd = k % blocks_per_tile_x_row;
+ float * x_dmf = (float *) x_dm;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+ int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+ #pragma unroll
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+ }
+
+ return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
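In load_tiles_q5_0 above, the four high bits held in `qh` are spread into bit 4 of each packed nibble-byte before 16 is subtracted. A standalone restatement of that scatter, for illustration only (the helper name is made up):

// Scatters qh bits 0..3 into bit positions 4, 12, 20 and 28 of a packed int,
// exactly as the qs0 path of load_tiles_q5_0 does.
static __device__ __forceinline__ int scatter_q5_high_bits(int qh) {
    int out = 0;
    out |= (qh << 4)  & 0x00000010;   // qh bit 0 -> bit 4  (byte 0)
    out |= (qh << 11) & 0x00001000;   // qh bit 1 -> bit 12 (byte 1)
    out |= (qh << 18) & 0x00100000;   // qh bit 2 -> bit 20 (byte 2)
    out |= (qh << 25) & 0x10000000;   // qh bit 3 -> bit 28 (byte 3)
    return out;
}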
+
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI5_1;
+ const int kqsx = k % QI5_1;
+
+ const block_q5_1 * bx0 = (const block_q5_1 *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+ const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+ int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+ const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
+
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+ #pragma unroll
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+ }
+
+ return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
+
+ *x_ql = tile_x_qs;
+ *x_dm = (half2 *) tile_x_d;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI8_0;
+ const int kqsx = k % QI8_0;
+ float * x_dmf = (float *) x_dm;
+
+ const block_q8_0 * bx0 = (const block_q8_0 *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+ const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh);
+
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ *x_sc = tile_x_sc;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI2_K;
+ const int kqsx = k % QI2_K;
+
+ const block_q2_K * bx0 = (const block_q2_K *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+ const int kbxd = k % blocks_per_tile_x_row;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh);
+
+ const int kbx = k / QI2_K;
+ const int ky = (k % QI2_K) * QR2_K;
+ const float * y_df = (const float *) y_ds;
+
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+ #pragma unroll
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+ }
+
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ *x_qh = tile_x_qh;
+ *x_sc = tile_x_sc;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI3_K;
+ const int kqsx = k % QI3_K;
+
+ const block_q3_K * bx0 = (const block_q3_K *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+ const int kbxd = k % blocks_per_tile_x_row;
+ float * x_dmf = (float *) x_dm;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+ int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+ int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+ const int ksc = k % (QI3_K/4);
+
+ const int ksc_low = ksc % (QI3_K/8);
+ const int shift_low = 4 * (ksc / (QI3_K/8));
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+ const int ksc_high = QI3_K/8;
+ const int shift_high = 2 * ksc;
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
+
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+ const int kbx = k / QI3_K;
+ const int ky = (k % QI3_K) * QR3_K;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+ #pragma unroll
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+ const int shift = 2 * ((ky % 32) / 8);
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+ const int vlh = (vh << 2) & 0x04040404;
+
+ v[l] = __vsubss4(vll, vlh);
+ }
+
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh);
+
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ *x_sc = tile_x_sc;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI4_K; // == 0 if QK_K == 256
+ const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+ const block_q4_K * bx0 = (const block_q4_K *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+ const int * scales = (const int *) bxi->scales;
+
+ const int ksc = k % (WARP_SIZE/8);
+
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh);
+
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
+
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh);
+
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ *x_sc = tile_x_sc;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI5_K; // == 0 if QK_K == 256
+ const int kqsx = k % QI5_K; // == k if QK_K == 256
+
+ const block_q5_K * bx0 = (const block_q5_K *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+ const int ky = QR5_K*kqsx;
+
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
+
+ const int * scales = (const int *) bxi->scales;
+
+ const int ksc = k % (WARP_SIZE/8);
+
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh);
+
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+ }
+
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ GGML_UNUSED(x_qh);
+
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
+
+ *x_ql = tile_x_ql;
+ *x_dm = tile_x_dm;
+ *x_sc = tile_x_sc;
+ }
+
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+ const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+ GGML_UNUSED(x_qh);
+
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+ const int kbx = k / QI6_K; // == 0 if QK_K == 256
+ const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+ const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + i_offset;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+ const int ky = QR6_K*kqsx;
+
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
+
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
+ float * x_dmf = (float *) x_dm;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+ }
+
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+ }
+ }
+
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+ GGML_UNUSED(x_qh);
+
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+ }
+
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+ allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+ static __device__ __forceinline__ void mul_mat_q(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ const int blocks_per_row_x = ncols_x / qk;
+ const int blocks_per_col_y = nrows_y / QK8_1;
+ const int blocks_per_warp = WARP_SIZE / qi;
+
+ const int & ncols_dst = ncols_y;
+
+ const int row_dst_0 = blockIdx.x*mmq_y;
+ const int & row_x_0 = row_dst_0;
+
+ const int col_dst_0 = blockIdx.y*mmq_x;
+ const int & col_y_0 = col_dst_0;
+
+ int * tile_x_ql = nullptr;
+ half2 * tile_x_dm = nullptr;
+ int * tile_x_qh = nullptr;
+ int * tile_x_sc = nullptr;
+
+ allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
+
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+ for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+ load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
+
+ #pragma unroll
+ for (int ir = 0; ir < qr; ++ir) {
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
+ const int kbxd = kqs / QI8_1;
+
+ #pragma unroll
+ for (int i = 0; i < mmq_x; i += nwarps) {
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+ const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+ }
+
+ #pragma unroll
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+ if (need_sum) {
+ *dsi_dst = *dsi_src;
+ } else {
+ float * dfi_dst = (float *) dsi_dst;
+ *dfi_dst = __low2float(*dsi_src);
+ }
+ }
+
+ __syncthreads();
+
+ // #pragma unroll // unrolling this loop causes too much register pressure
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+ #pragma unroll
+ for (int j = 0; j < mmq_x; j += nwarps) {
+ #pragma unroll
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+ threadIdx.x + i, threadIdx.y + j, k);
+ }
+ }
+ }
+
+ __syncthreads();
+ }
+ }
+
+ #pragma unroll
+ for (int j = 0; j < mmq_x; j += nwarps) {
+ const int col_dst = col_dst_0 + j + threadIdx.y;
+
+ if (col_dst >= ncols_dst) {
+ return;
+ }
+
+ #pragma unroll
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+ const int row_dst = row_dst_0 + threadIdx.x + i;
+
+ if (row_dst >= nrows_dst) {
+ continue;
+ }
+
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+ }
+ }
+ }
1178
+
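A note on the tiling above: each thread block of mul_mat_q produces one mmq_y x mmq_x tile of dst (blockIdx.x picks the row tile, blockIdx.y the column tile), and the ib0 loop walks the shared dimension in steps of blocks_per_warp quantized blocks before the final loops scatter the per-thread sums into dst. The sketch below mirrors only that write-out index math on the host; the sizes mmq_y = 64, mmq_x = 8, WARP_SIZE = 32 and nwarps = 4 are illustrative placeholders, not the shipped MMQ_CONFIG_* values.

    // Standalone sketch of mul_mat_q's output mapping (placeholder sizes, not the real configs).
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32, nwarps = 4;  // block_dims = (WARP_SIZE, nwarps, 1)
        const int mmq_y = 64, mmq_x = 8;       // rows x columns of dst written per block (hypothetical)
        const int bx = 1, by = 0;              // example blockIdx.x / blockIdx.y
        const int tx = 5, ty = 2;              // example threadIdx.x / threadIdx.y
        const int row_dst_0 = bx*mmq_y;        // first row of this block's tile
        const int col_dst_0 = by*mmq_x;        // first column of this block's tile
        for (int j = 0; j < mmq_x; j += nwarps) {
            for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                // same index math as the final write-out loops of mul_mat_q
                std::printf("thread (%d,%d) writes dst row %d, col %d\n",
                            tx, ty, row_dst_0 + tx + i, col_dst_0 + j + ty);
            }
        }
        return 0;
    }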
+ static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
+
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+ #if defined(RDNA3) || defined(RDNA2)
+     return mmq_config.rdna2;
+ #else
+     return mmq_config.rdna1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ #else
+
+ #if __CUDA_ARCH__ >= CC_VOLTA
+     return mmq_config.ampere;
+ #else
+     return mmq_config.pascal;
+ #endif // __CUDA_ARCH__ >= CC_VOLTA
+
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q4_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
+
+     mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
+         load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q4_1(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
+
+     mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
+         load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
+
+     mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
+         load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_1(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
+
+     mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
+         load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q8_0(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
+
+     mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
+         load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q2_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
+
+     mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
+         load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q3_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
+
+     mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
+         load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q4_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
+
+     mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
+         load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+     mul_mat_q5_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
+
+     mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
+         load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_VOLTA
+     __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
+ #endif // __CUDA_ARCH__ < CC_VOLTA
+     mul_mat_q6_K(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A
+     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
+
+     mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
+         load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+     GGML_UNUSED(get_arch_config_device);
+     GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
+     NO_DEVICE_CODE;
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define MMQ_SWITCH_CASE(type_suffix) \
+     case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
+         const bool need_check = false; \
+         mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+             (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+     } else { \
+         const bool need_check = true; \
+         mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+             (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+     } break; \
+
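For reference, MMQ_SWITCH_CASE(4_0) expands inside the dispatch switch below to the following case; the only difference between the two branches is the need_check template argument, so the bounds-checked kernel variant is launched only when row_diff is not a multiple of the tile height arch_config.y (expansion shown purely for illustration):

    case GGML_TYPE_Q4_0: if (row_diff % arch_config.y == 0) {
        const bool need_check = false;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } else {
        const bool need_check = true;
        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
    } break;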
+ void ggml_cuda_op_mul_mat_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+     const int64_t ne00 = src0->ne[0];
+
+     const int64_t ne10 = src1->ne[0];
+     GGML_ASSERT(ne10 % QK8_1 == 0);
+
+     const int64_t ne0 = dst->ne[0];
+
+     const int64_t row_diff = row_high - row_low;
+
+     int id = ggml_cuda_get_device();
+     const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+     // the main device has a larger memory buffer to hold the results from all GPUs
+     // nrows_dst == nrows of the matrix that the kernel writes into
+     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+     mmq_config_t mmq_config;
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             mmq_config = MMQ_CONFIG_Q4_0;
+             break;
+         case GGML_TYPE_Q4_1:
+             mmq_config = MMQ_CONFIG_Q4_1;
+             break;
+         case GGML_TYPE_Q5_0:
+             mmq_config = MMQ_CONFIG_Q5_0;
+             break;
+         case GGML_TYPE_Q5_1:
+             mmq_config = MMQ_CONFIG_Q5_1;
+             break;
+         case GGML_TYPE_Q8_0:
+             mmq_config = MMQ_CONFIG_Q8_0;
+             break;
+         case GGML_TYPE_Q2_K:
+             mmq_config = MMQ_CONFIG_Q2_K;
+             break;
+         case GGML_TYPE_Q3_K:
+             mmq_config = MMQ_CONFIG_Q3_K;
+             break;
+         case GGML_TYPE_Q4_K:
+             mmq_config = MMQ_CONFIG_Q4_K;
+             break;
+         case GGML_TYPE_Q5_K:
+             mmq_config = MMQ_CONFIG_Q5_K;
+             break;
+         case GGML_TYPE_Q6_K:
+             mmq_config = MMQ_CONFIG_Q6_K;
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     mmq_arch_config_t arch_config;
+     if (compute_capability >= CC_RDNA2) {
+         arch_config = mmq_config.rdna2;
+     } else if (compute_capability >= CC_OFFSET_AMD) {
+         arch_config = mmq_config.rdna1;
+     } else if (compute_capability >= CC_VOLTA) {
+         arch_config = mmq_config.ampere;
+     } else if (compute_capability >= MIN_CC_DP4A) {
+         arch_config = mmq_config.pascal;
+     } else {
+         GGML_ASSERT(false);
+     }
+
+     const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
+     const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
+     const dim3 block_nums(block_num_x, block_num_y, 1);
+     const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
+
+     switch (src0->type) {
+         MMQ_SWITCH_CASE(4_0)
+         MMQ_SWITCH_CASE(4_1)
+         MMQ_SWITCH_CASE(5_0)
+         MMQ_SWITCH_CASE(5_1)
+         MMQ_SWITCH_CASE(8_0)
+         MMQ_SWITCH_CASE(2_K)
+         MMQ_SWITCH_CASE(3_K)
+         MMQ_SWITCH_CASE(4_K)
+         MMQ_SWITCH_CASE(5_K)
+         MMQ_SWITCH_CASE(6_K)
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddf_i);
+ }
+
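The launch geometry in ggml_cuda_op_mul_mat_q is a plain ceiling division of the output shape by the per-block tile shape. A minimal standalone check of that arithmetic, assuming an illustrative tile of arch_config.y = 64 rows by arch_config.x = 8 columns (the real values come from the MMQ_CONFIG_* tables chosen per compute capability):

    // Sketch of the block_nums computation above, with placeholder tile sizes.
    #include <cstdio>

    int main() {
        const int arch_y = 64, arch_x = 8;  // dst rows / src1 columns covered per block (hypothetical)
        const int row_diff = 4000;          // rows of src0 assigned to this device
        const int src1_ncols = 20;          // columns of src1 in this batch
        const int block_num_x = (row_diff   + arch_y - 1) / arch_y;  // ceil(4000/64) = 63
        const int block_num_y = (src1_ncols + arch_x - 1) / arch_x;  // ceil(20/8)    = 3
        // 4000 % 64 == 32, so MMQ_SWITCH_CASE would pick the need_check = true kernel here
        std::printf("grid = (%d, %d, 1) blocks of (WARP_SIZE, nwarps, 1) threads\n", block_num_x, block_num_y);
        return 0;
    }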
+ bool ggml_cuda_supports_mmq(enum ggml_type type) {
+     switch (type) {
+         case GGML_TYPE_Q4_0:
+         case GGML_TYPE_Q4_1:
+         case GGML_TYPE_Q5_0:
+         case GGML_TYPE_Q5_1:
+         case GGML_TYPE_Q8_0:
+         case GGML_TYPE_Q2_K:
+         case GGML_TYPE_Q3_K:
+         case GGML_TYPE_Q4_K:
+         case GGML_TYPE_Q5_K:
+         case GGML_TYPE_Q6_K:
+             return true;
+         default:
+             return false;
+     }
+ }