llama_cpp 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,674 +0,0 @@
1
- #include "dmmv.cuh"
2
- #include "dequantize.cuh"
3
- #include "convert.cuh"
4
-
5
- #ifndef K_QUANTS_PER_ITERATION
6
- #define K_QUANTS_PER_ITERATION 2
7
- #else
8
- static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
9
- #endif
10
-
11
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
12
-
13
- static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
14
-
15
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
16
- if (row > nrows) return;
17
-
18
- const int num_blocks_per_row = ncols / QK_K;
19
- const int ib0 = row*num_blocks_per_row;
20
-
21
- const block_q2_K * x = (const block_q2_K *)vx + ib0;
22
-
23
- float tmp = 0; // partial sum for thread in warp
24
-
25
- const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
26
- const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
27
-
28
- const int step = 16/K_QUANTS_PER_ITERATION;
29
-
30
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
31
- const int in = tid - step*im; // 0...15 or 0...7
32
-
33
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
34
- const int q_offset = 32*im + l0;
35
- const int s_offset = 8*im;
36
- const int y_offset = 128*im + l0;
37
-
38
- uint32_t aux[4];
39
- const uint8_t * d = (const uint8_t *)aux;
40
- const uint8_t * m = (const uint8_t *)(aux + 2);
41
-
42
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
43
-
44
- const float * y = yy + i * QK_K + y_offset;
45
- const uint8_t * q = x[i].qs + q_offset;
46
-
47
- const float dall = __low2half(x[i].dm);
48
- const float dmin = __high2half(x[i].dm);
49
-
50
- const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
51
- aux[0] = a[0] & 0x0f0f0f0f;
52
- aux[1] = a[1] & 0x0f0f0f0f;
53
- aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
54
- aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
55
-
56
- float sum1 = 0, sum2 = 0;
57
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
58
- sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
59
- + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
60
- + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
61
- + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
62
- + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
63
- + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
64
- + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
65
- +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
66
- sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
67
- + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
68
-
69
- }
70
- tmp += dall * sum1 - dmin * sum2;
71
-
72
- }
73
-
74
- // sum up partial sums and write back result
75
- tmp = warp_reduce_sum(tmp);
76
-
77
- if (threadIdx.x == 0) {
78
- dst[row] = tmp;
79
- }
80
- }
81
-
82
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
83
-
84
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
85
- if (row > nrows) return;
86
-
87
- const int num_blocks_per_row = ncols / QK_K;
88
- const int ib0 = row*num_blocks_per_row;
89
-
90
- const block_q3_K * x = (const block_q3_K *)vx + ib0;
91
-
92
- float tmp = 0; // partial sum for thread in warp
93
-
94
- const uint16_t kmask1 = 0x0303;
95
- const uint16_t kmask2 = 0x0f0f;
96
-
97
- const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
98
- const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
99
-
100
- const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
101
- const int step = 16/K_QUANTS_PER_ITERATION;
102
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
103
- const int in = tid - step*im; // 0....15 or 0...7
104
-
105
- const uint8_t m = 1 << (4*im);
106
-
107
- const int l0 = n*in; // 0...15 or 0...14 in steps of 2
108
- const int q_offset = 32*im + l0;
109
- const int y_offset = 128*im + l0;
110
-
111
- uint16_t utmp[4];
112
- const int8_t * s = (const int8_t *)utmp;
113
-
114
- const uint16_t s_shift = 4*im;
115
-
116
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
117
-
118
- const float * y = yy + i * QK_K + y_offset;
119
- const uint8_t * q = x[i].qs + q_offset;
120
- const uint8_t * h = x[i].hmask + l0;
121
-
122
- const uint16_t * a = (const uint16_t *)x[i].scales;
123
- utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
124
- utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
125
- utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
126
- utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
127
-
128
- const float d = x[i].d;
129
-
130
- float sum = 0;
131
- for (int l = 0; l < n; ++l) {
132
- sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
133
- + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
134
- + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
135
- + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
136
- sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
137
- + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
138
- + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
139
- + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
140
- }
141
- tmp += d * sum;
142
-
143
- }
144
-
145
- // sum up partial sums and write back result
146
- tmp = warp_reduce_sum(tmp);
147
-
148
- if (threadIdx.x == 0) {
149
- dst[row] = tmp;
150
- }
151
- }
152
-
153
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
154
-
155
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
156
- if (row > nrows) return;
157
- const int num_blocks_per_row = ncols / QK_K;
158
- const int ib0 = row*num_blocks_per_row;
159
-
160
- const block_q4_K * x = (const block_q4_K *)vx + ib0;
161
-
162
- const uint16_t kmask1 = 0x3f3f;
163
- const uint16_t kmask2 = 0x0f0f;
164
- const uint16_t kmask3 = 0xc0c0;
165
-
166
- const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
167
- const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
168
-
169
- const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
170
-
171
- const int il = tid/step; // 0...3
172
- const int ir = tid - step*il; // 0...7 or 0...3
173
- const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
174
-
175
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
176
- const int in = il%2;
177
-
178
- const int l0 = n*(2*ir + in);
179
- const int q_offset = 32*im + l0;
180
- const int y_offset = 64*im + l0;
181
-
182
- uint16_t aux[4];
183
- const uint8_t * sc = (const uint8_t *)aux;
184
-
185
- #if K_QUANTS_PER_ITERATION == 2
186
- uint32_t q32[4];
187
- const uint8_t * q4 = (const uint8_t *)q32;
188
- #else
189
- uint16_t q16[4];
190
- const uint8_t * q4 = (const uint8_t *)q16;
191
- #endif
192
-
193
- float tmp = 0; // partial sum for thread in warp
194
-
195
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
196
-
197
- const float * y1 = yy + i*QK_K + y_offset;
198
- const float * y2 = y1 + 128;
199
-
200
- const float dall = __low2half(x[i].dm);
201
- const float dmin = __high2half(x[i].dm);
202
-
203
- const uint16_t * a = (const uint16_t *)x[i].scales;
204
- aux[0] = a[im+0] & kmask1;
205
- aux[1] = a[im+2] & kmask1;
206
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
207
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
208
-
209
- #if K_QUANTS_PER_ITERATION == 2
210
- const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
211
- const uint32_t * q2 = q1 + 16;
212
-
213
- q32[0] = q1[0] & 0x0f0f0f0f;
214
- q32[1] = q1[0] & 0xf0f0f0f0;
215
- q32[2] = q2[0] & 0x0f0f0f0f;
216
- q32[3] = q2[0] & 0xf0f0f0f0;
217
-
218
- float4 s = {0.f, 0.f, 0.f, 0.f};
219
- float smin = 0;
220
- for (int l = 0; l < 4; ++l) {
221
- s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
222
- s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
223
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
224
- }
225
- tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
226
- #else
227
- const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
228
- const uint16_t * q2 = q1 + 32;
229
-
230
- q16[0] = q1[0] & 0x0f0f;
231
- q16[1] = q1[0] & 0xf0f0;
232
- q16[2] = q2[0] & 0x0f0f;
233
- q16[3] = q2[0] & 0xf0f0;
234
-
235
- float4 s = {0.f, 0.f, 0.f, 0.f};
236
- float smin = 0;
237
- for (int l = 0; l < 2; ++l) {
238
- s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
239
- s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
240
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
241
- }
242
- tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
243
- #endif
244
-
245
- }
246
-
247
- // sum up partial sums and write back result
248
- tmp = warp_reduce_sum(tmp);
249
-
250
- if (tid == 0) {
251
- dst[row] = tmp;
252
- }
253
- }
254
-
255
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
256
-
257
- const int row = blockIdx.x;
258
- const int num_blocks_per_row = ncols / QK_K;
259
- const int ib0 = row*num_blocks_per_row;
260
-
261
- const block_q5_K * x = (const block_q5_K *)vx + ib0;
262
-
263
- float tmp = 0; // partial sum for thread in warp
264
-
265
- const uint16_t kmask1 = 0x3f3f;
266
- const uint16_t kmask2 = 0x0f0f;
267
- const uint16_t kmask3 = 0xc0c0;
268
-
269
- const int tid = threadIdx.x/2; // 0...15
270
- const int ix = threadIdx.x%2;
271
-
272
- const int il = tid/4; // 0...3
273
- const int ir = tid - 4*il;// 0...3
274
- const int n = 2;
275
-
276
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
277
- const int in = il%2;
278
-
279
- const int l0 = n*(2*ir + in);
280
- const int q_offset = 32*im + l0;
281
- const int y_offset = 64*im + l0;
282
-
283
- const uint8_t hm1 = 1 << (2*im);
284
- const uint8_t hm2 = hm1 << 4;
285
-
286
- uint16_t aux[4];
287
- const uint8_t * sc = (const uint8_t *)aux;
288
-
289
- uint16_t q16[8];
290
- const uint8_t * q4 = (const uint8_t *)q16;
291
-
292
- for (int i = ix; i < num_blocks_per_row; i += 2) {
293
-
294
- const uint8_t * ql1 = x[i].qs + q_offset;
295
- const uint8_t * qh = x[i].qh + l0;
296
- const float * y1 = yy + i*QK_K + y_offset;
297
- const float * y2 = y1 + 128;
298
-
299
- const float dall = __low2half(x[i].dm);
300
- const float dmin = __high2half(x[i].dm);
301
-
302
- const uint16_t * a = (const uint16_t *)x[i].scales;
303
- aux[0] = a[im+0] & kmask1;
304
- aux[1] = a[im+2] & kmask1;
305
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
306
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
307
-
308
- float4 sum = {0.f, 0.f, 0.f, 0.f};
309
- float smin = 0;
310
- const uint16_t * q1 = (const uint16_t *)ql1;
311
- const uint16_t * q2 = q1 + 32;
312
- q16[0] = q1[0] & 0x0f0f;
313
- q16[1] = q1[8] & 0x0f0f;
314
- q16[2] = (q1[0] >> 4) & 0x0f0f;
315
- q16[3] = (q1[8] >> 4) & 0x0f0f;
316
- q16[4] = q2[0] & 0x0f0f;
317
- q16[5] = q2[8] & 0x0f0f;
318
- q16[6] = (q2[0] >> 4) & 0x0f0f;
319
- q16[7] = (q2[8] >> 4) & 0x0f0f;
320
- for (int l = 0; l < n; ++l) {
321
- sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
322
- + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
323
- sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
324
- + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
325
- sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
326
- + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
327
- sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
328
- + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
329
- smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
330
- + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
331
- }
332
- tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
333
- }
334
-
335
- // sum up partial sums and write back result
336
- tmp = warp_reduce_sum(tmp);
337
-
338
- if (threadIdx.x == 0) {
339
- dst[row] = tmp;
340
- }
341
- }
342
-
343
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
344
-
345
- static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
346
-
347
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
348
- if (row > nrows) return;
349
-
350
- const int num_blocks_per_row = ncols / QK_K;
351
- const int ib0 = row*num_blocks_per_row;
352
-
353
- const block_q6_K * x = (const block_q6_K *)vx + ib0;
354
-
355
- const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
356
- const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
357
-
358
- const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
359
-
360
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
361
- const int in = tid - step*im; // 0...15 or 0...7
362
-
363
- #if K_QUANTS_PER_ITERATION == 1
364
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
365
- const int is = 0;
366
- #else
367
- const int l0 = 4 * in; // 0, 4, 8, ..., 28
368
- const int is = in / 4;
369
- #endif
370
- const int ql_offset = 64*im + l0;
371
- const int qh_offset = 32*im + l0;
372
- const int s_offset = 8*im + is;
373
- const int y_offset = 128*im + l0;
374
-
375
- float tmp = 0; // partial sum for thread in warp
376
-
377
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
378
-
379
- const float * y = yy + i * QK_K + y_offset;
380
- const uint8_t * ql = x[i].ql + ql_offset;
381
- const uint8_t * qh = x[i].qh + qh_offset;
382
- const int8_t * s = x[i].scales + s_offset;
383
-
384
- const float d = x[i].d;
385
-
386
- #if K_QUANTS_PER_ITERATION == 1
387
- float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
388
- + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
389
- + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
390
- + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
391
- + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
392
- + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
393
- + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
394
- +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
395
- tmp += sum;
396
- #else
397
- float sum = 0;
398
- for (int l = 0; l < 4; ++l) {
399
- sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
400
- + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
401
- + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
402
- + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
403
- }
404
- tmp += sum;
405
- #endif
406
-
407
- }
408
-
409
- // sum up partial sums and write back result
410
- tmp = warp_reduce_sum(tmp);
411
-
412
- if (tid == 0) {
413
- dst[row] = tmp;
414
- }
415
- }
416
-
417
- static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
418
- const half * x = (const half *) vx;
419
-
420
- // automatic half -> float type cast if dfloat == float
421
- v.x = x[ib + iqs + 0];
422
- v.y = x[ib + iqs + 1];
423
- }
424
-
425
- static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
426
- return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
427
- type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
428
- type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
429
- type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
430
- type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
431
- type == GGML_TYPE_F16 ? convert_f16 :
432
- nullptr;
433
- }
434
-
435
- template <ggml_type type>
436
- static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
437
- constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
438
- constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
439
- constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
440
-
441
- const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
442
-
443
- if (row >= nrows) {
444
- return;
445
- }
446
-
447
- const int tid = threadIdx.x;
448
-
449
- const int iter_stride = 2*GGML_CUDA_DMMV_X;
450
- const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
451
- const int y_offset = qr == 1 ? 1 : qk/2;
452
-
453
- // partial sum for each thread
454
- #ifdef GGML_CUDA_F16
455
- half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
456
- #else
457
- float tmp = 0.0f;
458
- #endif // GGML_CUDA_F16
459
-
460
- for (int i = 0; i < ncols; i += iter_stride) {
461
- const int col = i + vals_per_iter*tid;
462
- const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
463
- const int iqs = (col%qk)/qr; // x quant index
464
- const int iybs = col - col%qk; // y block start index
465
-
466
- // processing >2 values per i iter is faster for fast GPUs
467
- #pragma unroll
468
- for (int j = 0; j < vals_per_iter; j += 2) {
469
- // process 2 vals per j iter
470
-
471
- // dequantize
472
- // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
473
- dfloat2 v;
474
- dequantize_kernel(vx, ib, iqs + j/qr, v);
475
-
476
- // matrix multiplication
477
- // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
478
- #ifdef GGML_CUDA_F16
479
- tmp += __hmul2(v, {
480
- y[iybs + iqs + j/qr + 0],
481
- y[iybs + iqs + j/qr + y_offset]
482
- });
483
- #else
484
- tmp += v.x * y[iybs + iqs + j/qr + 0];
485
- tmp += v.y * y[iybs + iqs + j/qr + y_offset];
486
- #endif // GGML_CUDA_F16
487
- }
488
- }
489
-
490
- // sum up partial sums and write back result
491
- tmp = warp_reduce_sum(tmp);
492
-
493
- if (tid == 0) {
494
- #ifdef GGML_CUDA_F16
495
- dst[row] = tmp.x + tmp.y;
496
- #else
497
- dst[row] = tmp;
498
- #endif // GGML_CUDA_F16
499
- }
500
- }
501
-
502
- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
503
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
504
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
505
- // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
506
- const dim3 block_nums(block_num_y, 1, 1);
507
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
508
- dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
509
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
510
- }
511
-
512
- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
513
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
514
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
515
- const dim3 block_nums(block_num_y, 1, 1);
516
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
517
- dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
518
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
519
- }
520
-
521
- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
522
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
523
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
524
- const dim3 block_nums(block_num_y, 1, 1);
525
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
526
- dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
527
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
528
- }
529
-
530
- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
531
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
532
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
533
- const dim3 block_nums(block_num_y, 1, 1);
534
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
535
- dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
536
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
537
- }
538
-
539
- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
540
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
541
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
542
- const dim3 block_nums(block_num_y, 1, 1);
543
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
544
- dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
545
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
546
- }
547
-
548
- static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
549
- GGML_ASSERT(ncols % QK_K == 0);
550
- const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
551
- const int block_num_y = (nrows + ny - 1) / ny;
552
- const dim3 block_nums(block_num_y, 1, 1);
553
- const dim3 block_dims(32, ny, 1);
554
- dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
555
- }
556
-
557
- static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
558
- GGML_ASSERT(ncols % QK_K == 0);
559
- const int ny = 2 / K_QUANTS_PER_ITERATION;
560
- const int block_num_y = (nrows + ny - 1) / ny;
561
- const dim3 block_nums(block_num_y, 1, 1);
562
- const dim3 block_dims(32, ny, 1);
563
- dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
564
- }
565
-
566
- static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
567
- GGML_ASSERT(ncols % QK_K == 0);
568
- const int ny = 2 / K_QUANTS_PER_ITERATION;
569
- const int block_num_y = (nrows + ny - 1) / ny;
570
- const dim3 block_nums(block_num_y, 1, 1);
571
- const dim3 block_dims(32, ny, 1);
572
- dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
573
- }
574
-
575
- static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
576
- GGML_ASSERT(ncols % QK_K == 0);
577
- const dim3 block_dims(32, 1, 1);
578
- dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
579
- }
580
-
581
- static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
582
- GGML_ASSERT(ncols % QK_K == 0);
583
- const int ny = 2 / K_QUANTS_PER_ITERATION;
584
- const int block_num_y = (nrows + ny - 1) / ny;
585
- const dim3 block_nums(block_num_y, 1, 1);
586
- const dim3 block_dims(32, ny, 1);
587
- dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
588
- }
589
-
590
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
591
- GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
592
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
593
- const dim3 block_nums(block_num_y, 1, 1);
594
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
595
- dequantize_mul_mat_vec<GGML_TYPE_F16>
596
- <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
597
- }
598
-
599
- void ggml_cuda_op_dequantize_mul_mat_vec(
600
- ggml_backend_cuda_context & ctx,
601
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
602
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
603
- const int64_t src1_padded_row_size, cudaStream_t stream) {
604
- GGML_UNUSED(ctx);
605
- const int64_t ne00 = src0->ne[0];
606
- const int64_t row_diff = row_high - row_low;
607
-
608
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
609
-
610
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
611
- #ifdef GGML_CUDA_F16
612
- ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
613
- half * src1_dfloat = nullptr; // dfloat == half
614
-
615
- bool src1_convert_f16 =
616
- src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
617
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
618
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
619
-
620
- if (src1_convert_f16) {
621
- src1_dfloat = src1_dfloat_a.alloc(ne00);
622
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
623
- GGML_ASSERT(to_fp16_cuda != nullptr);
624
- to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
625
- }
626
- #else
627
- const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
628
- #endif // GGML_CUDA_F16
629
-
630
- switch (src0->type) {
631
- case GGML_TYPE_Q4_0:
632
- dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
633
- break;
634
- case GGML_TYPE_Q4_1:
635
- dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
636
- break;
637
- case GGML_TYPE_Q5_0:
638
- dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
639
- break;
640
- case GGML_TYPE_Q5_1:
641
- dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
642
- break;
643
- case GGML_TYPE_Q8_0:
644
- dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
645
- break;
646
- case GGML_TYPE_Q2_K:
647
- dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
648
- break;
649
- case GGML_TYPE_Q3_K:
650
- dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
651
- break;
652
- case GGML_TYPE_Q4_K:
653
- dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
654
- break;
655
- case GGML_TYPE_Q5_K:
656
- dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
657
- break;
658
- case GGML_TYPE_Q6_K:
659
- dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
660
- break;
661
- case GGML_TYPE_F16:
662
- convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
663
- break;
664
- default:
665
- GGML_ASSERT(false);
666
- break;
667
- }
668
-
669
- GGML_UNUSED(src1);
670
- GGML_UNUSED(dst);
671
- GGML_UNUSED(src1_ddq_i);
672
- GGML_UNUSED(src1_ncols);
673
- GGML_UNUSED(src1_padded_row_size);
674
- }