llama_cpp 0.16.0 → 0.16.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/ext/llama_cpp/extconf.rb +2 -0
  4. data/ext/llama_cpp/llama_cpp.cpp +2 -0
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +2 -0
  7. data/vendor/tmp/llama.cpp/Makefile +110 -53
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
  17. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
  18. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
  19. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
  20. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
  21. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
  23. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
  24. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
  25. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
  26. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
  27. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
  28. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
  29. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
  30. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
  31. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
  32. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
  33. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
  34. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
  35. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
  125. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
  126. data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
  127. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
  128. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
  129. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
  130. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
  131. data/vendor/tmp/llama.cpp/ggml.c +102 -275
  132. data/vendor/tmp/llama.cpp/llama.cpp +103 -47
  133. data/vendor/tmp/llama.cpp/llama.h +4 -0
  134. metadata +15 -3
@@ -1,1450 +1,4 @@
1
1
  #include "mmq.cuh"
2
- #include "vecdotq.cuh"
3
-
4
- typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
5
- typedef void (*load_tiles_cuda_t)(
6
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
7
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
8
- typedef float (*vec_dot_q_mul_mat_cuda_t)(
9
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
10
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
11
- typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
12
- typedef void (mul_mat_q_t)(
13
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
14
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
15
-
16
- struct mmq_arch_config_t {
17
- int x;
18
- int y;
19
- int nwarps;
20
- };
21
-
22
- struct mmq_config_t {
23
- mmq_arch_config_t rdna2;
24
- mmq_arch_config_t rdna1;
25
- mmq_arch_config_t ampere;
26
- mmq_arch_config_t pascal;
27
- };
28
-
29
- constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
30
- // x y nwarps
31
- { 64, 128, 8},
32
- { 64, 64, 8},
33
- #ifdef CUDA_USE_TENSOR_CORES
34
- { 4, 32, 4},
35
- #else
36
- { 64, 128, 4},
37
- #endif // CUDA_USE_TENSOR_CORES
38
- { 64, 64, 8},
39
- };
40
- constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
41
- // x y nwarps
42
- { 64, 128, 8},
43
- { 64, 64, 8},
44
- #ifdef CUDA_USE_TENSOR_CORES
45
- { 4, 32, 4},
46
- #else
47
- { 64, 128, 4},
48
- #endif // CUDA_USE_TENSOR_CORES
49
- { 64, 64, 8},
50
- };
51
- constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
52
- // x y nwarps
53
- { 64, 128, 8},
54
- { 64, 64, 8},
55
- #ifdef CUDA_USE_TENSOR_CORES
56
- { 4, 32, 4},
57
- #else
58
- {128, 64, 4},
59
- #endif // CUDA_USE_TENSOR_CORES
60
- { 64, 64, 8},
61
- };
62
- constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
63
- // x y nwarps
64
- { 64, 128, 8},
65
- { 64, 64, 8},
66
- #ifdef CUDA_USE_TENSOR_CORES
67
- { 4, 32, 4},
68
- #else
69
- {128, 64, 4},
70
- #endif // CUDA_USE_TENSOR_CORES
71
- { 64, 64, 8},
72
- };
73
- constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
74
- // x y nwarps
75
- { 64, 128, 8},
76
- { 64, 64, 8},
77
- #ifdef CUDA_USE_TENSOR_CORES
78
- { 4, 32, 4},
79
- #else
80
- {128, 64, 4},
81
- #endif // CUDA_USE_TENSOR_CORES
82
- { 64, 64, 8},
83
- };
84
- constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
85
- // x y nwarps
86
- { 64, 128, 8},
87
- {128, 32, 8},
88
- #ifdef CUDA_USE_TENSOR_CORES
89
- { 4, 32, 4},
90
- #else
91
- { 64, 128, 4},
92
- #endif // CUDA_USE_TENSOR_CORES
93
- { 64, 64, 8},
94
- };
95
- constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
96
- // x y nwarps
97
- {128, 64, 8},
98
- { 32, 128, 8},
99
- #ifdef CUDA_USE_TENSOR_CORES
100
- { 4, 32, 4},
101
- #else
102
- {128, 128, 4},
103
- #endif // CUDA_USE_TENSOR_CORES
104
- { 64, 64, 8},
105
- };
106
- constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
107
- // x y nwarps
108
- { 64, 128, 8},
109
- { 32, 64, 8},
110
- #ifdef CUDA_USE_TENSOR_CORES
111
- { 4, 32, 4},
112
- #else
113
- { 64, 128, 4},
114
- #endif // CUDA_USE_TENSOR_CORES
115
- { 64, 64, 8},
116
- };
117
- constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
118
- // x y nwarps
119
- { 64, 128, 8},
120
- { 32, 64, 8},
121
- #ifdef CUDA_USE_TENSOR_CORES
122
- { 4, 32, 4},
123
- #else
124
- { 64, 128, 4},
125
- #endif // CUDA_USE_TENSOR_CORES
126
- { 64, 64, 8},
127
- };
128
- constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
129
- // x y nwarps
130
- { 64, 128, 8},
131
- { 32, 64, 8},
132
- #ifdef CUDA_USE_TENSOR_CORES
133
- { 4, 32, 4},
134
- #else
135
- { 64, 64, 4},
136
- #endif // CUDA_USE_TENSOR_CORES
137
- { 64, 64, 8},
138
- };
139
-
140
- // ------------------------------------------------------------
141
-
142
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
143
- GGML_UNUSED(x_qh);
144
- GGML_UNUSED(x_sc);
145
-
146
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
147
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
148
-
149
- *x_ql = tile_x_qs;
150
- *x_dm = (half2 *) tile_x_d;
151
- }
152
-
153
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
154
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
155
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
156
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
157
- GGML_CUDA_ASSUME(i_offset >= 0);
158
- GGML_CUDA_ASSUME(i_offset < nwarps);
159
- GGML_CUDA_ASSUME(k >= 0);
160
- GGML_CUDA_ASSUME(k < WARP_SIZE);
161
-
162
- const int kbx = k / QI4_0;
163
- const int kqsx = k % QI4_0;
164
-
165
- const block_q4_0 * bx0 = (const block_q4_0 *) vx;
166
-
167
- float * x_dmf = (float *) x_dm;
168
-
169
- #pragma unroll
170
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
171
- int i = i0 + i_offset;
172
-
173
- if (need_check) {
174
- i = min(i, i_max);
175
- }
176
-
177
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
178
-
179
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
180
- // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
181
- }
182
-
183
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
184
- const int kbxd = k % blocks_per_tile_x_row;
185
-
186
- #pragma unroll
187
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
188
- int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
189
-
190
- if (need_check) {
191
- i = min(i, i_max);
192
- }
193
-
194
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
195
-
196
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
197
- }
198
- }
199
-
200
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
201
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
202
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
203
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
204
-
205
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
206
- const float * x_dmf = (const float *) x_dm;
207
-
208
- int u[2*VDR_Q4_0_Q8_1_MMQ];
209
-
210
- #pragma unroll
211
- for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
212
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
213
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
214
- }
215
-
216
- return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
217
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
218
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
219
- }
220
-
221
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
222
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
223
-
224
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
225
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
226
-
227
- *x_ql = tile_x_qs;
228
- *x_dm = tile_x_dm;
229
- }
230
-
231
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
232
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
233
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
234
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
235
-
236
- GGML_CUDA_ASSUME(i_offset >= 0);
237
- GGML_CUDA_ASSUME(i_offset < nwarps);
238
- GGML_CUDA_ASSUME(k >= 0);
239
- GGML_CUDA_ASSUME(k < WARP_SIZE);
240
-
241
- const int kbx = k / QI4_1;
242
- const int kqsx = k % QI4_1;
243
-
244
- const block_q4_1 * bx0 = (const block_q4_1 *) vx;
245
-
246
- #pragma unroll
247
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
248
- int i = i0 + i_offset;
249
-
250
- if (need_check) {
251
- i = min(i, i_max);
252
- }
253
-
254
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
255
-
256
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
257
- }
258
-
259
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
260
- const int kbxd = k % blocks_per_tile_x_row;
261
-
262
- #pragma unroll
263
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
264
- int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
265
-
266
- if (need_check) {
267
- i = min(i, i_max);
268
- }
269
-
270
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
271
-
272
- x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
273
- }
274
- }
275
-
276
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
277
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
278
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
279
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
280
-
281
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
282
-
283
- int u[2*VDR_Q4_1_Q8_1_MMQ];
284
-
285
- #pragma unroll
286
- for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
287
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
288
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
289
- }
290
-
291
- return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
292
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
293
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
294
- }
295
-
296
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
297
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
298
-
299
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
300
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
301
-
302
- *x_ql = tile_x_ql;
303
- *x_dm = (half2 *) tile_x_d;
304
- }
305
-
306
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
307
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
308
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
309
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
310
-
311
- GGML_CUDA_ASSUME(i_offset >= 0);
312
- GGML_CUDA_ASSUME(i_offset < nwarps);
313
- GGML_CUDA_ASSUME(k >= 0);
314
- GGML_CUDA_ASSUME(k < WARP_SIZE);
315
-
316
- const int kbx = k / QI5_0;
317
- const int kqsx = k % QI5_0;
318
-
319
- const block_q5_0 * bx0 = (const block_q5_0 *) vx;
320
-
321
- #pragma unroll
322
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
323
- int i = i0 + i_offset;
324
-
325
- if (need_check) {
326
- i = min(i, i_max);
327
- }
328
-
329
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
330
-
331
- const int ql = get_int_from_uint8(bxi->qs, kqsx);
332
- const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
333
-
334
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
335
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
336
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
337
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
338
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
339
- qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
340
-
341
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
342
-
343
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
344
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
345
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
346
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
347
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
348
- qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
349
-
350
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
351
- }
352
-
353
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
354
- const int kbxd = k % blocks_per_tile_x_row;
355
- float * x_dmf = (float *) x_dm;
356
-
357
- #pragma unroll
358
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
359
- int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
360
-
361
- if (need_check) {
362
- i = min(i, i_max);
363
- }
364
-
365
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
366
-
367
- x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
368
- }
369
- }
370
-
371
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
372
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
373
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
374
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
375
-
376
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
377
- const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
378
- const float * x_dmf = (const float *) x_dm;
379
- const float * y_df = (const float *) y_ds;
380
-
381
- int u[2*VDR_Q5_0_Q8_1_MMQ];
382
-
383
- #pragma unroll
384
- for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
385
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
386
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
387
- }
388
-
389
- return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
390
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
391
- }
392
-
393
-
394
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
395
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
396
-
397
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
398
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
399
-
400
- *x_ql = tile_x_ql;
401
- *x_dm = tile_x_dm;
402
- }
403
-
404
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
405
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
406
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
407
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
408
-
409
- GGML_CUDA_ASSUME(i_offset >= 0);
410
- GGML_CUDA_ASSUME(i_offset < nwarps);
411
- GGML_CUDA_ASSUME(k >= 0);
412
- GGML_CUDA_ASSUME(k < WARP_SIZE);
413
-
414
- const int kbx = k / QI5_1;
415
- const int kqsx = k % QI5_1;
416
-
417
- const block_q5_1 * bx0 = (const block_q5_1 *) vx;
418
-
419
- #pragma unroll
420
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
421
- int i = i0 + i_offset;
422
-
423
- if (need_check) {
424
- i = min(i, i_max);
425
- }
426
-
427
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
428
-
429
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
430
- const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
431
-
432
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
433
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
434
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
435
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
436
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
437
-
438
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
439
-
440
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
441
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
442
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
443
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
444
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
445
-
446
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
447
- }
448
-
449
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
450
- const int kbxd = k % blocks_per_tile_x_row;
451
-
452
- #pragma unroll
453
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
454
- int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
455
-
456
- if (need_check) {
457
- i = min(i, i_max);
458
- }
459
-
460
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
461
-
462
- x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
463
- }
464
- }
465
-
466
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
467
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
468
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
469
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
470
-
471
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
472
- const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
473
-
474
- int u[2*VDR_Q5_1_Q8_1_MMQ];
475
-
476
- #pragma unroll
477
- for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
478
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
479
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
480
- }
481
-
482
- return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
483
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
484
- }
485
-
486
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
487
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
488
-
489
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
490
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
491
-
492
- *x_ql = tile_x_qs;
493
- *x_dm = (half2 *) tile_x_d;
494
- }
495
-
496
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
497
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
498
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
499
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
500
-
501
- GGML_CUDA_ASSUME(i_offset >= 0);
502
- GGML_CUDA_ASSUME(i_offset < nwarps);
503
- GGML_CUDA_ASSUME(k >= 0);
504
- GGML_CUDA_ASSUME(k < WARP_SIZE);
505
-
506
- const int kbx = k / QI8_0;
507
- const int kqsx = k % QI8_0;
508
- float * x_dmf = (float *) x_dm;
509
-
510
- const block_q8_0 * bx0 = (const block_q8_0 *) vx;
511
-
512
- #pragma unroll
513
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
514
- int i = i0 + i_offset;
515
-
516
- if (need_check) {
517
- i = min(i, i_max);
518
- }
519
-
520
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
521
-
522
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
523
- }
524
-
525
- const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
526
- const int kbxd = k % blocks_per_tile_x_row;
527
-
528
- #pragma unroll
529
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
530
- int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
531
-
532
- if (need_check) {
533
- i = min(i, i_max);
534
- }
535
-
536
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
537
-
538
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
539
- }
540
- }
541
-
542
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
543
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
544
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
545
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
546
-
547
- const float * x_dmf = (const float *) x_dm;
548
- const float * y_df = (const float *) y_ds;
549
-
550
- return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
551
- (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
552
- y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
553
- }
554
-
555
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
556
- GGML_UNUSED(x_qh);
557
-
558
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
559
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
560
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
561
-
562
- *x_ql = tile_x_ql;
563
- *x_dm = tile_x_dm;
564
- *x_sc = tile_x_sc;
565
- }
566
-
567
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
568
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
569
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
570
- GGML_UNUSED(x_qh);
571
-
572
- GGML_CUDA_ASSUME(i_offset >= 0);
573
- GGML_CUDA_ASSUME(i_offset < nwarps);
574
- GGML_CUDA_ASSUME(k >= 0);
575
- GGML_CUDA_ASSUME(k < WARP_SIZE);
576
-
577
- const int kbx = k / QI2_K;
578
- const int kqsx = k % QI2_K;
579
-
580
- const block_q2_K * bx0 = (const block_q2_K *) vx;
581
-
582
- #pragma unroll
583
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
584
- int i = i0 + i_offset;
585
-
586
- if (need_check) {
587
- i = min(i, i_max);
588
- }
589
-
590
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
591
-
592
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
593
- }
594
-
595
- const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
596
- const int kbxd = k % blocks_per_tile_x_row;
597
-
598
- #pragma unroll
599
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
600
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
601
-
602
- if (need_check) {
603
- i = min(i, i_max);
604
- }
605
-
606
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
607
-
608
- x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
609
- }
610
-
611
- #pragma unroll
612
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
613
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
614
-
615
- if (need_check) {
616
- i = min(i, i_max);
617
- }
618
-
619
- const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
620
-
621
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
622
- }
623
- }
624
-
625
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
626
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
627
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
628
- GGML_UNUSED(x_qh);
629
-
630
- const int kbx = k / QI2_K;
631
- const int ky = (k % QI2_K) * QR2_K;
632
- const float * y_df = (const float *) y_ds;
633
-
634
- int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
635
-
636
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
637
- const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
638
-
639
- #pragma unroll
640
- for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
641
- v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
642
- }
643
-
644
- const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
645
-
646
- const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
647
- return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
648
- }
649
-
650
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
651
-
652
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
653
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
654
- __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
655
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
656
-
657
- *x_ql = tile_x_ql;
658
- *x_dm = tile_x_dm;
659
- *x_qh = tile_x_qh;
660
- *x_sc = tile_x_sc;
661
- }
662
-
663
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
664
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
665
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
666
-
667
- GGML_CUDA_ASSUME(i_offset >= 0);
668
- GGML_CUDA_ASSUME(i_offset < nwarps);
669
- GGML_CUDA_ASSUME(k >= 0);
670
- GGML_CUDA_ASSUME(k < WARP_SIZE);
671
-
672
- const int kbx = k / QI3_K;
673
- const int kqsx = k % QI3_K;
674
-
675
- const block_q3_K * bx0 = (const block_q3_K *) vx;
676
-
677
- #pragma unroll
678
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
679
- int i = i0 + i_offset;
680
-
681
- if (need_check) {
682
- i = min(i, i_max);
683
- }
684
-
685
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
686
-
687
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
688
- }
689
-
690
- const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
691
- const int kbxd = k % blocks_per_tile_x_row;
692
- float * x_dmf = (float *) x_dm;
693
-
694
- #pragma unroll
695
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
696
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
697
-
698
- if (need_check) {
699
- i = min(i, i_max);
700
- }
701
-
702
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
703
-
704
- x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
705
- }
706
-
707
- #pragma unroll
708
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
709
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
710
-
711
- if (need_check) {
712
- i = min(i, i_max);
713
- }
714
-
715
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
716
-
717
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
718
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
719
- }
720
-
721
- #pragma unroll
722
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
723
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
724
-
725
- if (need_check) {
726
- i = min(i, i_max);
727
- }
728
-
729
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
730
-
731
- const int ksc = k % (QI3_K/4);
732
-
733
- const int ksc_low = ksc % (QI3_K/8);
734
- const int shift_low = 4 * (ksc / (QI3_K/8));
735
- const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
736
-
737
- const int ksc_high = QI3_K/8;
738
- const int shift_high = 2 * ksc;
739
- const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
740
-
741
- const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
742
-
743
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
744
- }
745
- }
746
-
747
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
748
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
749
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
750
-
751
- const int kbx = k / QI3_K;
752
- const int ky = (k % QI3_K) * QR3_K;
753
- const float * x_dmf = (const float *) x_dm;
754
- const float * y_df = (const float *) y_ds;
755
-
756
- const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
757
-
758
- int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
759
-
760
- #pragma unroll
761
- for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
762
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
763
- const int shift = 2 * ((ky % 32) / 8);
764
- const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
765
-
766
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
767
- const int vlh = (vh << 2) & 0x04040404;
768
-
769
- v[l] = __vsubss4(vll, vlh);
770
- }
771
-
772
- const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
773
- return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
774
- }
775
-
776
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
777
- GGML_UNUSED(x_qh);
778
-
779
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
780
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
781
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
782
-
783
- *x_ql = tile_x_ql;
784
- *x_dm = tile_x_dm;
785
- *x_sc = tile_x_sc;
786
- }
787
-
788
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
789
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
790
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
791
- GGML_UNUSED(x_qh);
792
-
793
- GGML_CUDA_ASSUME(i_offset >= 0);
794
- GGML_CUDA_ASSUME(i_offset < nwarps);
795
- GGML_CUDA_ASSUME(k >= 0);
796
- GGML_CUDA_ASSUME(k < WARP_SIZE);
797
-
798
- const int kbx = k / QI4_K; // == 0 if QK_K == 256
799
- const int kqsx = k % QI4_K; // == k if QK_K == 256
800
-
801
- const block_q4_K * bx0 = (const block_q4_K *) vx;
802
-
803
- #pragma unroll
804
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
805
- int i = i0 + i_offset;
806
-
807
- if (need_check) {
808
- i = min(i, i_max);
809
- }
810
-
811
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
812
-
813
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
814
- }
815
-
816
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
817
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
818
-
819
- #pragma unroll
820
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
821
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
822
-
823
- if (need_check) {
824
- i = min(i, i_max);
825
- }
826
-
827
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
828
-
829
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
830
- }
831
-
832
- #pragma unroll
833
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
834
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
835
-
836
- if (need_check) {
837
- i = min(i, i_max);
838
- }
839
-
840
- const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
841
-
842
- const int * scales = (const int *) bxi->scales;
843
-
844
- const int ksc = k % (WARP_SIZE/8);
845
-
846
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
847
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
848
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
849
-
850
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
851
- }
852
- }
853
-
854
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
855
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
856
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
857
- GGML_UNUSED(x_qh);
858
-
859
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
860
-
861
- const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
862
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
863
- x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
864
- }
865
-
866
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
867
- GGML_UNUSED(x_qh);
868
-
869
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
870
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
871
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
872
-
873
- *x_ql = tile_x_ql;
874
- *x_dm = tile_x_dm;
875
- *x_sc = tile_x_sc;
876
- }
877
-
878
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
879
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
880
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
881
- GGML_UNUSED(x_qh);
882
-
883
- GGML_CUDA_ASSUME(i_offset >= 0);
884
- GGML_CUDA_ASSUME(i_offset < nwarps);
885
- GGML_CUDA_ASSUME(k >= 0);
886
- GGML_CUDA_ASSUME(k < WARP_SIZE);
887
-
888
- const int kbx = k / QI5_K; // == 0 if QK_K == 256
889
- const int kqsx = k % QI5_K; // == k if QK_K == 256
890
-
891
- const block_q5_K * bx0 = (const block_q5_K *) vx;
892
-
893
- #pragma unroll
894
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
895
- int i = i0 + i_offset;
896
-
897
- if (need_check) {
898
- i = min(i, i_max);
899
- }
900
-
901
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
902
- const int ky = QR5_K*kqsx;
903
-
904
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
905
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
906
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
907
-
908
- const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
909
- const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
910
- const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
911
-
912
- const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
913
- const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
914
-
915
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
916
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
917
- }
918
-
919
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
920
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
921
-
922
- #pragma unroll
923
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
924
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
925
-
926
- if (need_check) {
927
- i = min(i, i_max);
928
- }
929
-
930
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
931
-
932
- x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
933
- }
934
-
935
- #pragma unroll
936
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
937
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
938
-
939
- if (need_check) {
940
- i = min(i, i_max);
941
- }
942
-
943
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
944
-
945
- const int * scales = (const int *) bxi->scales;
946
-
947
- const int ksc = k % (WARP_SIZE/8);
948
-
949
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
950
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
951
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
952
-
953
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
954
- }
955
- }
956
-
957
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
958
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
959
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
960
- GGML_UNUSED(x_qh);
961
-
962
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
963
-
964
- const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
965
- const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
966
- return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
967
- x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
968
- }
969
-
970
- template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
971
- GGML_UNUSED(x_qh);
972
-
973
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
974
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
975
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
976
-
977
- *x_ql = tile_x_ql;
978
- *x_dm = tile_x_dm;
979
- *x_sc = tile_x_sc;
980
- }
981
-
982
- template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
983
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
984
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
985
- GGML_UNUSED(x_qh);
986
-
987
- GGML_CUDA_ASSUME(i_offset >= 0);
988
- GGML_CUDA_ASSUME(i_offset < nwarps);
989
- GGML_CUDA_ASSUME(k >= 0);
990
- GGML_CUDA_ASSUME(k < WARP_SIZE);
991
-
992
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
993
- const int kqsx = k % QI6_K; // == k if QK_K == 256
994
-
995
- const block_q6_K * bx0 = (const block_q6_K *) vx;
996
-
997
- #pragma unroll
998
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
999
- int i = i0 + i_offset;
1000
-
1001
- if (need_check) {
1002
- i = min(i, i_max);
1003
- }
1004
-
1005
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
1006
- const int ky = QR6_K*kqsx;
1007
-
1008
- const int ql = get_int_from_uint8(bxi->ql, kqsx);
1009
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
1010
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
1011
-
1012
- const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
1013
- const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
1014
- const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
1015
-
1016
- const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
1017
- const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
1018
-
1019
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
1020
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
1021
- }
1022
-
1023
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
1024
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
1025
- float * x_dmf = (float *) x_dm;
1026
-
1027
- #pragma unroll
1028
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
1029
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
1030
-
1031
- if (need_check) {
1032
- i = min(i, i_max);
1033
- }
1034
-
1035
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
1036
-
1037
- x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
1038
- }
1039
-
1040
- #pragma unroll
1041
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
1042
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
1043
-
1044
- if (need_check) {
1045
- i = min(i, i_max);
1046
- }
1047
-
1048
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
1049
-
1050
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
1051
- }
1052
- }
1053
-
1054
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
1055
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1056
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1057
- GGML_UNUSED(x_qh);
1058
-
1059
- const float * x_dmf = (const float *) x_dm;
1060
- const float * y_df = (const float *) y_ds;
1061
-
1062
- const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
1063
-
1064
- const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
1065
- const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
1066
- return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
1067
- }
1068
-
1069
- template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
1070
- allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
1071
- static __device__ __forceinline__ void mul_mat_q(
1072
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
1073
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
1074
-
1075
- const block_q_t * x = (const block_q_t *) vx;
1076
- const block_q8_1 * y = (const block_q8_1 *) vy;
1077
-
1078
- const int blocks_per_row_x = ncols_x / qk;
1079
- const int blocks_per_col_y = nrows_y / QK8_1;
1080
- const int blocks_per_warp = WARP_SIZE / qi;
1081
-
1082
- const int & ncols_dst = ncols_y;
1083
-
1084
-    const int row_dst_0 = blockIdx.x*mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = blockIdx.y*mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    int * tile_x_ql = nullptr;
-    half2 * tile_x_dm = nullptr;
-    int * tile_x_qh = nullptr;
-    int * tile_x_sc = nullptr;
-
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
-    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + threadIdx.x;
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
-                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
-                half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = __low2float(*dsi_src);
-                }
-            }
-
-            __syncthreads();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                            threadIdx.x + i, threadIdx.y + j, k);
-                    }
-                }
-            }
-
-            __syncthreads();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-#if defined(RDNA3) || defined(RDNA2)
-    return mmq_config.rdna2;
-#else
-    return mmq_config.rdna1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-#else
-
-#if __CUDA_ARCH__ >= CC_VOLTA
-    return mmq_config.ampere;
-#else
-    return mmq_config.pascal;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
-        load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
-        load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
-        load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
-        load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
-        load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
-        load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
-        load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
-        load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
-        load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
-    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
-        load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    GGML_UNUSED(get_arch_config_device);
-    GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define MMQ_SWITCH_CASE(type_suffix) \
-    case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
-        const bool need_check = false; \
-        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
-            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
-    } else { \
-        const bool need_check = true; \
-        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
-            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
-    } break; \
 
 void ggml_cuda_op_mul_mat_q(
     ggml_backend_cuda_context & ctx,
@@ -1454,12 +8,16 @@ void ggml_cuda_op_mul_mat_q(
 
     const int64_t ne00 = src0->ne[0];
 
+    const int64_t nb01 = src0->nb[1];
+
     const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
     GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne0 = dst->ne[0];
 
     const int64_t row_diff = row_high - row_low;
+    const int64_t stride00 = nb01 / ggml_type_size(src0->type);
 
     int id = ggml_cuda_get_device();
     const int compute_capability = ggml_cuda_info().devices[id].cc;
@@ -1468,73 +26,39 @@ void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
 
-    mmq_config_t mmq_config;
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mmq_config = MMQ_CONFIG_Q4_0;
+            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mmq_config = MMQ_CONFIG_Q4_1;
+            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
            break;
         case GGML_TYPE_Q5_0:
-            mmq_config = MMQ_CONFIG_Q5_0;
+            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
            break;
         case GGML_TYPE_Q5_1:
-            mmq_config = MMQ_CONFIG_Q5_1;
+            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
            break;
         case GGML_TYPE_Q8_0:
-            mmq_config = MMQ_CONFIG_Q8_0;
+            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
            break;
         case GGML_TYPE_Q2_K:
-            mmq_config = MMQ_CONFIG_Q2_K;
+            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
            break;
         case GGML_TYPE_Q3_K:
-            mmq_config = MMQ_CONFIG_Q3_K;
+            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
            break;
         case GGML_TYPE_Q4_K:
-            mmq_config = MMQ_CONFIG_Q4_K;
+            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
            break;
         case GGML_TYPE_Q5_K:
-            mmq_config = MMQ_CONFIG_Q5_K;
+            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
            break;
         case GGML_TYPE_Q6_K:
-            mmq_config = MMQ_CONFIG_Q6_K;
-            break;
-        default:
-            GGML_ASSERT(false);
+            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
            break;
-    }
-
-    mmq_arch_config_t arch_config;
-    if (compute_capability >= CC_RDNA2) {
-        arch_config = mmq_config.rdna2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        arch_config = mmq_config.rdna1;
-    } else if (compute_capability >= CC_VOLTA) {
-        arch_config = mmq_config.ampere;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        arch_config = mmq_config.pascal;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
-    const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
-
-    switch (src0->type) {
-        MMQ_SWITCH_CASE(4_0)
-        MMQ_SWITCH_CASE(4_1)
-        MMQ_SWITCH_CASE(5_0)
-        MMQ_SWITCH_CASE(5_1)
-        MMQ_SWITCH_CASE(8_0)
-        MMQ_SWITCH_CASE(2_K)
-        MMQ_SWITCH_CASE(3_K)
-        MMQ_SWITCH_CASE(4_K)
-        MMQ_SWITCH_CASE(5_K)
-        MMQ_SWITCH_CASE(6_K)
         default:
             GGML_ASSERT(false);
             break;