llama_cpp 0.15.4 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/llama_cpp/extconf.rb +1 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +15 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +13 -1
  7. data/vendor/tmp/llama.cpp/Makefile +62 -35
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
  9. data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
  10. data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
  11. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  12. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  13. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
  14. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
  131. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
  132. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  133. data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
  134. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  135. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
  136. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
  137. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
  138. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
  139. data/vendor/tmp/llama.cpp/ggml.c +178 -330
  140. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  141. data/vendor/tmp/llama.cpp/llama.cpp +242 -426
  142. data/vendor/tmp/llama.cpp/llama.h +17 -43
  143. metadata +121 -6
  144. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  145. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  146. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  147. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu (new file; item 20 in the list above)
@@ -0,0 +1,662 @@
+ #include "dmmv.cuh"
+ #include "dequantize.cuh"
+ #include "convert.cuh"
+
+ #ifndef K_QUANTS_PER_ITERATION
+ #define K_QUANTS_PER_ITERATION 2
+ #else
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+ #endif
+
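K_QUANTS_PER_ITERATION trades per-thread work against parallel lanes per row. The thread-layout expressions repeated in the k-quant kernels below resolve as follows (a derived summary, not text from the diff):

    // Shared layout arithmetic, K = K_QUANTS_PER_ITERATION:
    //                        K == 1      K == 2 (default)
    // tid = threadIdx.x/K    0..31       0..15
    // ix  = threadIdx.x%K    always 0    0 or 1
    // Lanes with different ix interleave over a row's super-blocks
    // (the outer i-loop strides by K), so K == 2 halves the loop trip
    // count while each thread unpacks more quants per iteration.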
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     if (row >= nrows) return;
+
+     const int num_blocks_per_row = ncols / QK_K;
+     const int ib0 = row*num_blocks_per_row;
+
+     const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+     float tmp = 0; // partial sum for thread in warp
+
+     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+     const int step = 16/K_QUANTS_PER_ITERATION;
+
+     const int im = tid/step;        // 0 or 1. 0 computes 0..., 1 computes 128...
+     const int in = tid - step*im;   // 0...15 or 0...7
+
+     const int l0 = K_QUANTS_PER_ITERATION*in;  // 0...15 or 0...14 in steps of 2
+     const int q_offset = 32*im + l0;
+     const int s_offset = 8*im;
+     const int y_offset = 128*im + l0;
+
+     uint32_t aux[4];
+     const uint8_t * d = (const uint8_t *)aux;
+     const uint8_t * m = (const uint8_t *)(aux + 2);
+
+     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+         const float   * y = yy + i * QK_K + y_offset;
+         const uint8_t * q = x[i].qs + q_offset;
+
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);
+
+         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+         aux[0] = a[0] & 0x0f0f0f0f;
+         aux[1] = a[1] & 0x0f0f0f0f;
+         aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+         aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+         float sum1 = 0, sum2 = 0;
+         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+             sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                   + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                   + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                   + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                   + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                   + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                   + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                   + y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+             sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[l+96] * m[6]
+                   + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+         }
+         tmp += dall * sum1 - dmin * sum2;
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (threadIdx.x == 0) {
+         dst[row] = tmp;
+     }
+ }
+
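Taken together with the final warp reduction, each of these kernels computes one quantized dot product per row. For q2_K (assuming the default QK_K = 256, with $N_b = \mathrm{ncols}/\mathrm{QK\_K}$ super-blocks per row), the value stored to dst[row] is

    dst[row] = \sum_{i=0}^{N_b-1} \left( d_i \sum_{j=0}^{255} y_{256i+j} \, s_{\lfloor j/16 \rfloor} \, q_{i,j} \;-\; d_i^{\mathrm{min}} \sum_{j=0}^{255} y_{256i+j} \, m_{\lfloor j/16 \rfloor} \right), \qquad q_{i,j} \in \{0,\dots,3\}

where $d_i$ and $d_i^{\mathrm{min}}$ are dall and dmin, and $s$, $m$ are the 4-bit sub-block scales and mins unpacked into aux[] above. The q3_k through q6_k kernels accumulate analogous sums with their own bit packing and recentering in place of the mins.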
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     if (row >= nrows) return;
+
+     const int num_blocks_per_row = ncols / QK_K;
+     const int ib0 = row*num_blocks_per_row;
+
+     const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+     float tmp = 0; // partial sum for thread in warp
+
+     const uint16_t kmask1 = 0x0303;
+     const uint16_t kmask2 = 0x0f0f;
+
+     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+     const int n    = K_QUANTS_PER_ITERATION;  // iterations in the inner loop
+     const int step = 16/K_QUANTS_PER_ITERATION;
+     const int im   = tid/step;       // 0 or 1. 0 computes 0..., 1 computes 128...
+     const int in   = tid - step*im;  // 0...15 or 0...7
+
+     const uint8_t m = 1 << (4*im);
+
+     const int l0 = n*in;  // 0...15 or 0...14 in steps of 2
+     const int q_offset = 32*im + l0;
+     const int y_offset = 128*im + l0;
+
+     uint16_t utmp[4];
+     const int8_t * s = (const int8_t *)utmp;
+
+     const uint16_t s_shift = 4*im;
+
+     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+         const float   * y = yy + i * QK_K + y_offset;
+         const uint8_t * q = x[i].qs + q_offset;
+         const uint8_t * h = x[i].hmask + l0;
+
+         const uint16_t * a = (const uint16_t *)x[i].scales;
+         utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+         utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+         utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+         utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+         const float d = x[i].d;
+
+         float sum = 0;
+         for (int l = 0; l < n; ++l) {
+             sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                  + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                  + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                  + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+             sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                  + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                  + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                  + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+         }
+         tmp += d * sum;
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (threadIdx.x == 0) {
+         dst[row] = tmp;
+     }
+ }
+
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     if (row >= nrows) return;
+     const int num_blocks_per_row = ncols / QK_K;
+     const int ib0 = row*num_blocks_per_row;
+
+     const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+     const uint16_t kmask1 = 0x3f3f;
+     const uint16_t kmask2 = 0x0f0f;
+     const uint16_t kmask3 = 0xc0c0;
+
+     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+     const int step = 8/K_QUANTS_PER_ITERATION;  // 8 or 4
+
+     const int il = tid/step;              // 0...3
+     const int ir = tid - step*il;         // 0...7 or 0...3
+     const int n  = 2 * K_QUANTS_PER_ITERATION;  // 2 or 4
+
+     const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+     const int in = il%2;
+
+     const int l0 = n*(2*ir + in);
+     const int q_offset = 32*im + l0;
+     const int y_offset = 64*im + l0;
+
+     uint16_t aux[4];
+     const uint8_t * sc = (const uint8_t *)aux;
+
+ #if K_QUANTS_PER_ITERATION == 2
+     uint32_t q32[4];
+     const uint8_t * q4 = (const uint8_t *)q32;
+ #else
+     uint16_t q16[4];
+     const uint8_t * q4 = (const uint8_t *)q16;
+ #endif
+
+     float tmp = 0; // partial sum for thread in warp
+
+     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+         const float * y1 = yy + i*QK_K + y_offset;
+         const float * y2 = y1 + 128;
+
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);
+
+         const uint16_t * a = (const uint16_t *)x[i].scales;
+         aux[0] = a[im+0] & kmask1;
+         aux[1] = a[im+2] & kmask1;
+         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+ #if K_QUANTS_PER_ITERATION == 2
+         const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+         const uint32_t * q2 = q1 + 16;
+
+         q32[0] = q1[0] & 0x0f0f0f0f;
+         q32[1] = q1[0] & 0xf0f0f0f0;
+         q32[2] = q2[0] & 0x0f0f0f0f;
+         q32[3] = q2[0] & 0xf0f0f0f0;
+
+         float4 s = {0.f, 0.f, 0.f, 0.f};
+         float smin = 0;
+         for (int l = 0; l < 4; ++l) {
+             s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+             s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
+             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+         }
+         tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #else
+         const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+         const uint16_t * q2 = q1 + 32;
+
+         q16[0] = q1[0] & 0x0f0f;
+         q16[1] = q1[0] & 0xf0f0;
+         q16[2] = q2[0] & 0x0f0f;
+         q16[3] = q2[0] & 0xf0f0;
+
+         float4 s = {0.f, 0.f, 0.f, 0.f};
+         float smin = 0;
+         for (int l = 0; l < 2; ++l) {
+             s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+             s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+         }
+         tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #endif
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (tid == 0) {
+         dst[row] = tmp;
+     }
+ }
+
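One detail in the q4_k path above is easy to miss: in the K_QUANTS_PER_ITERATION == 2 branch the high nibbles are masked with 0xf0f0f0f0 but never shifted down, and the factor 1.f/16.f applied to the sc[1] and sc[5] terms compensates, saving shift instructions in the inner loop. A standalone check of the identity (illustrative values, not from the diff):

    // A byte's high-nibble portion equals 16 * its high-nibble value,
    // so multiplying by 1/16 after the dot product is equivalent to
    // shifting before it.
    #include <cstdio>

    int main() {
        const unsigned char packed = 0xA7;  // hi nibble 10, lo nibble 7
        const float y = 2.0f;
        const float direct   = y * float(packed >> 4);                // 2 * 10 = 20
        const float deferred = y * float(packed & 0xF0) * (1.f/16.f); // 2 * 160 / 16 = 20
        printf("%g %g\n", direct, deferred);  // prints "20 20"
        return 0;
    }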
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
+
+     const int row = blockIdx.x;
+     const int num_blocks_per_row = ncols / QK_K;
+     const int ib0 = row*num_blocks_per_row;
+
+     const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+     float tmp = 0; // partial sum for thread in warp
+
+     const uint16_t kmask1 = 0x3f3f;
+     const uint16_t kmask2 = 0x0f0f;
+     const uint16_t kmask3 = 0xc0c0;
+
+     const int tid = threadIdx.x/2;  // 0...15
+     const int ix  = threadIdx.x%2;
+
+     const int il = tid/4;       // 0...3
+     const int ir = tid - 4*il;  // 0...3
+     const int n  = 2;
+
+     const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+     const int in = il%2;
+
+     const int l0 = n*(2*ir + in);
+     const int q_offset = 32*im + l0;
+     const int y_offset = 64*im + l0;
+
+     const uint8_t hm1 = 1 << (2*im);
+     const uint8_t hm2 = hm1 << 4;
+
+     uint16_t aux[4];
+     const uint8_t * sc = (const uint8_t *)aux;
+
+     uint16_t q16[8];
+     const uint8_t * q4 = (const uint8_t *)q16;
+
+     for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+         const uint8_t * ql1 = x[i].qs + q_offset;
+         const uint8_t * qh  = x[i].qh + l0;
+         const float   * y1  = yy + i*QK_K + y_offset;
+         const float   * y2  = y1 + 128;
+
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);
+
+         const uint16_t * a = (const uint16_t *)x[i].scales;
+         aux[0] = a[im+0] & kmask1;
+         aux[1] = a[im+2] & kmask1;
+         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+         float4 sum = {0.f, 0.f, 0.f, 0.f};
+         float smin = 0;
+         const uint16_t * q1 = (const uint16_t *)ql1;
+         const uint16_t * q2 = q1 + 32;
+         q16[0] = q1[0] & 0x0f0f;
+         q16[1] = q1[8] & 0x0f0f;
+         q16[2] = (q1[0] >> 4) & 0x0f0f;
+         q16[3] = (q1[8] >> 4) & 0x0f0f;
+         q16[4] = q2[0] & 0x0f0f;
+         q16[5] = q2[8] & 0x0f0f;
+         q16[6] = (q2[0] >> 4) & 0x0f0f;
+         q16[7] = (q2[8] >> 4) & 0x0f0f;
+         for (int l = 0; l < n; ++l) {
+             sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                    + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+             sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                    + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+             sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                    + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+             sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                    + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+         }
+         tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (threadIdx.x == 0) {
+         dst[row] = tmp;
+     }
+ }
+
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+     const int row = blockIdx.x*blockDim.y + threadIdx.y;
+     if (row >= nrows) return;
+
+     const int num_blocks_per_row = ncols / QK_K;
+     const int ib0 = row*num_blocks_per_row;
+
+     const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+     const int step = 16/K_QUANTS_PER_ITERATION;  // 16 or 8
+
+     const int im = tid/step;       // 0 or 1. 0 computes 0..., 1 computes 128...
+     const int in = tid - step*im;  // 0...15 or 0...7
+
+ #if K_QUANTS_PER_ITERATION == 1
+     const int l0 = K_QUANTS_PER_ITERATION*in;  // 0...15
+     const int is = 0;
+ #else
+     const int l0 = 4 * in;  // 0, 4, 8, ..., 28
+     const int is = in / 4;
+ #endif
+     const int ql_offset = 64*im + l0;
+     const int qh_offset = 32*im + l0;
+     const int s_offset  =  8*im + is;
+     const int y_offset  = 128*im + l0;
+
+     float tmp = 0; // partial sum for thread in warp
+
+     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+         const float   * y  = yy + i * QK_K + y_offset;
+         const uint8_t * ql = x[i].ql + ql_offset;
+         const uint8_t * qh = x[i].qh + qh_offset;
+         const int8_t  * s  = x[i].scales + s_offset;
+
+         const float d = x[i].d;
+
+ #if K_QUANTS_PER_ITERATION == 1
+         float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                   + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                   + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                   + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                   + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                   + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                   + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                   + y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+         tmp += sum;
+ #else
+         float sum = 0;
+         for (int l = 0; l < 4; ++l) {
+             sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                  + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                  + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                  + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+         }
+         tmp += sum;
+ #endif
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (tid == 0) {
+         dst[row] = tmp;
+     }
+ }
+
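Every kernel in this file ends with warp_reduce_sum, which this diff does not define (upstream llama.cpp keeps it in ggml-cuda/common.cuh, along with a half2 overload for the GGML_CUDA_F16 path). A minimal sketch of the usual butterfly reduction, assuming WARP_SIZE == 32:

    // Sketch modeled on upstream common.cuh, not part of this diff:
    // xor-shuffle halves the stride each step; after 5 steps every
    // lane holds the sum of all 32 lanes.
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x;
    }

Because the xor pattern leaves the full sum in every lane, guarding the final store with threadIdx.x == 0 (q2_k, q3_k, q5_k) or with tid == 0 (q4_k, q6_k, where up to two lanes qualify) stores the same value either way.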
+ static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+     const half * x = (const half *) vx;
+
+     // automatic half -> float type cast if dfloat == float
+     v.x = x[ib + iqs + 0];
+     v.y = x[ib + iqs + 1];
+ }
+
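convert_f16 writes through dfloat2, whose meaning flips with the build flags; the typedefs live outside this diff. A sketch of how they resolve, modeled on upstream ggml-cuda/common.cuh:

    // Not part of this diff; modeled on upstream common.cuh.
    #ifdef GGML_CUDA_F16
    typedef half  dfloat;   // dequantized values stay in half precision
    typedef half2 dfloat2;
    #else
    typedef float  dfloat;  // default: dequantize to float
    typedef float2 dfloat2;
    #endif

This is what the "automatic half -> float type cast" comment above relies on: assigning a half element of x into a float-typed v.x converts implicitly.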
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
+     // qk = quantized weights per x block
+     // qr = number of quantized weights per data value in x block
+     const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
+
+     if (row >= nrows) {
+         return;
+     }
+
+     const int tid = threadIdx.x;
+
+     const int iter_stride = 2*GGML_CUDA_DMMV_X;
+     const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+     const int y_offset = qr == 1 ? 1 : qk/2;
+
+ // partial sum for each thread
+ #ifdef GGML_CUDA_F16
+     half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+ #else
+     float tmp = 0.0f;
+ #endif // GGML_CUDA_F16
+
+     for (int i = 0; i < ncols; i += iter_stride) {
+         const int col = i + vals_per_iter*tid;
+         const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
+         const int iqs = (col%qk)/qr; // x quant index
+         const int iybs = col - col%qk; // y block start index
+
+ // processing >2 values per i iter is faster for fast GPUs
+ #pragma unroll
+         for (int j = 0; j < vals_per_iter; j += 2) {
+             // process 2 vals per j iter
+
+             // dequantize
+             // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+             dfloat2 v;
+             dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+             // matrix multiplication
+             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+ #ifdef GGML_CUDA_F16
+             tmp += __hmul2(v, {
+                 y[iybs + iqs + j/qr + 0],
+                 y[iybs + iqs + j/qr + y_offset]
+             });
+ #else
+             tmp += v.x * y[iybs + iqs + j/qr + 0];
+             tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+ #endif // GGML_CUDA_F16
+         }
+     }
+
+     // sum up partial sums and write back result
+     tmp = warp_reduce_sum(tmp);
+
+     if (tid == 0) {
+ #ifdef GGML_CUDA_F16
+         dst[row] = tmp.x + tmp.y;
+ #else
+         dst[row] = tmp;
+ #endif // GGML_CUDA_F16
+     }
+ }
+
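The instantiations below bind dequantize_kernel to per-format helpers from dequantize.cuh, which this diff does not include. The contract is: given block index ib and quant index iqs, unpack two adjacent quants into v, scaled and recentered. A sketch of the Q4_0 case, modeled on upstream llama.cpp:

    // Modeled on upstream dequantize.cuh; not part of this diff.
    static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v) {
        const block_q4_0 * x = (const block_q4_0 *) vx;

        const dfloat d = x[ib].d;     // per-block scale
        const int  vui = x[ib].qs[iqs];

        v.x = vui & 0xF;              // low nibble
        v.y = vui >> 4;               // high nibble

    #ifdef GGML_CUDA_F16
        v = __hsub2(v, {8.0f, 8.0f}); // recenter: quants are stored as value + 8
        v = __hmul2(v, {d, d});
    #else
        v.x = (v.x - 8.0f)*d;
        v.y = (v.y - 8.0f)*d;
    #endif
    }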
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % QK_K == 0);
+     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+     const int block_num_y = (nrows + ny - 1) / ny;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(32, ny, 1);
+     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % QK_K == 0);
+     const int ny = 2 / K_QUANTS_PER_ITERATION;
+     const int block_num_y = (nrows + ny - 1) / ny;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(32, ny, 1);
+     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % QK_K == 0);
+     const int ny = 2 / K_QUANTS_PER_ITERATION;
+     const int block_num_y = (nrows + ny - 1) / ny;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(32, ny, 1);
+     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % QK_K == 0);
+     const dim3 block_dims(32, 1, 1);
+     dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
+ }
+
+ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % QK_K == 0);
+     const int ny = 2 / K_QUANTS_PER_ITERATION;
+     const int block_num_y = (nrows + ny - 1) / ny;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(32, ny, 1);
+     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+     const dim3 block_nums(block_num_y, 1, 1);
+     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+     dequantize_mul_mat_vec<1, 1, convert_f16>
+         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+ }
+
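On the "maximum grid size" comment in the launchers above: CUDA limits gridDim.y and gridDim.z to 65535 blocks while gridDim.x allows 2^31 - 1, so rows must index the x dimension. A worked example with hypothetical sizes (not taken from this release):

    // e.g. a vocabulary-projection matmul over a 128k-row weight matrix:
    const int nrows = 131072;
    const int mmv_y = 1;  // stand-in for the build-defined GGML_CUDA_MMV_Y
    const int block_num_y = (nrows + mmv_y - 1) / mmv_y;  // = 131072
    // dim3(1, block_num_y, 1) exceeds the 65535 cap on gridDim.y and the
    // launch would fail; dim3(block_num_y, 1, 1) is far below the x cap.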
+ void ggml_cuda_op_dequantize_mul_mat_vec(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+     GGML_UNUSED(ctx);
+     const int64_t ne00 = src0->ne[0];
+     const int64_t row_diff = row_high - row_low;
+
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+ #ifdef GGML_CUDA_F16
+     ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
+     half * src1_dfloat = nullptr; // dfloat == half
+
+     bool src1_convert_f16 =
+         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+     if (src1_convert_f16) {
+         src1_dfloat = src1_dfloat_a.alloc(ne00);
+         const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+         GGML_ASSERT(to_fp16_cuda != nullptr);
+         to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
+     }
+ #else
+     const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+ #endif // GGML_CUDA_F16
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q4_1:
+             dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q5_0:
+             dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q5_1:
+             dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q8_0:
+             dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q2_K:
+             dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q3_K:
+             dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q4_K:
+             dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q5_K:
+             dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_Q6_K:
+             dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+             break;
+         case GGML_TYPE_F16:
+             convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddq_i);
+     GGML_UNUSED(src1_ncols);
+     GGML_UNUSED(src1_padded_row_size);
+ }