llama_cpp 0.16.2 → 0.17.0

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu
@@ -1,88 +0,0 @@
- #include "mmq.cuh"
-
- void ggml_cuda_op_mul_mat_q(
-     ggml_backend_cuda_context & ctx,
-     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-     const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-     const int64_t ne00 = src0->ne[0];
-
-     const int64_t nb01 = src0->nb[1];
-
-     const int64_t ne10 = src1->ne[0];
-     const int64_t ne11 = src1->ne[1];
-     GGML_ASSERT(ne10 % QK8_1 == 0);
-
-     const int64_t ne0 = dst->ne[0];
-
-     const int64_t row_diff = row_high - row_low;
-     const int64_t stride00 = nb01 / ggml_type_size(src0->type);
-
-     int id = ggml_cuda_get_device();
-     const int compute_capability = ggml_cuda_info().devices[id].cc;
-
-     // the main device has a larger memory buffer to hold the results from all GPUs
-     // nrows_dst == nrows of the matrix that the kernel writes into
-     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-
-     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
-
-     switch (src0->type) {
-         case GGML_TYPE_Q4_0:
-             mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q4_1:
-             mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q5_0:
-             mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q5_1:
-             mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q8_0:
-             mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q2_K:
-             mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q3_K:
-             mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q4_K:
-             mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q5_K:
-             mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
-             break;
-         case GGML_TYPE_Q6_K:
-             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
-             break;
-         default:
-             GGML_ASSERT(false);
-             break;
-     }
-
-     GGML_UNUSED(src1);
-     GGML_UNUSED(dst);
-     GGML_UNUSED(src1_ddf_i);
- }
-
- bool ggml_cuda_supports_mmq(enum ggml_type type) {
-     switch (type) {
-         case GGML_TYPE_Q4_0:
-         case GGML_TYPE_Q4_1:
-         case GGML_TYPE_Q5_0:
-         case GGML_TYPE_Q5_1:
-         case GGML_TYPE_Q8_0:
-         case GGML_TYPE_Q2_K:
-         case GGML_TYPE_Q3_K:
-         case GGML_TYPE_Q4_K:
-         case GGML_TYPE_Q5_K:
-         case GGML_TYPE_Q6_K:
-             return true;
-         default:
-             return false;
-     }
- }
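
The deleted ggml_cuda_op_mul_mat_q above shows a dispatch idiom that recurs throughout these files: the quantization type is only known at runtime, so the host code switches on it once and each case instantiates a kernel template specialized for that type, making the type a compile-time constant inside the kernel. The sketch below is a minimal, self-contained illustration of that pattern, not llama.cpp code; all names in it (toy_type, scale_for, scale_kernel, scale_dispatch) are hypothetical.

// Toy sketch of runtime-enum -> compile-time-template dispatch (assumptions,
// not library code). The switch runs once on the host; inside the kernel the
// type is a template constant, so per-type values fold away at compile time.
#include <cstdio>
#include <cuda_runtime.h>

enum toy_type { TOY_Q4, TOY_Q8 };

// per-type constant resolved at template instantiation, no runtime branch
template <toy_type T> __host__ __device__ constexpr float scale_for() {
    return T == TOY_Q4 ? 0.25f : 1.0f;
}

template <toy_type T>
__global__ void scale_kernel(const float * x, float * y, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = x[i] * scale_for<T>(); // type is a compile-time constant here
    }
}

// host-side dispatch: one switch, one instantiation per case
static void scale_dispatch(toy_type t, const float * x, float * y, int n, cudaStream_t stream) {
    const dim3 grid((n + 255)/256), block(256);
    switch (t) {
        case TOY_Q4: scale_kernel<TOY_Q4><<<grid, block, 0, stream>>>(x, y, n); break;
        case TOY_Q8: scale_kernel<TOY_Q8><<<grid, block, 0, stream>>>(x, y, n); break;
        default: break;
    }
}

int main() {
    const int n = 8;
    float hx[n], hy[n];
    for (int i = 0; i < n; ++i) hx[i] = float(i);
    float *dx, *dy;
    cudaMalloc(&dx, n*sizeof(float));
    cudaMalloc(&dy, n*sizeof(float));
    cudaMemcpy(dx, hx, n*sizeof(float), cudaMemcpyHostToDevice);
    scale_dispatch(TOY_Q4, dx, dy, n, 0);
    cudaMemcpy(hy, dy, n*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) printf("%g ", hy[i]); // 0 0.25 0.5 ...
    printf("\n");
    cudaFree(dx); cudaFree(dy);
    return 0;
}

The same shape appears below in mmvq.cu, where get_vec_dot_q_cuda and get_vdr_mmvq map the template parameter to per-type dot-product kernels and constants.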
data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu
@@ -1,419 +0,0 @@
- #include "mmvq.cuh"
- #include "vecdotq.cuh"
-
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
-
- static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
-     return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
-         type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
-         type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
-         type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
-         type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
-         type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
-         type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
-         type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
-         type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
-         type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
-         type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
-         type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
-         type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
-         type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
-         type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
-         type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
-         type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
-         type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
-         type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
-         nullptr;
- }
-
- static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
-     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
-         type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
-         type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
-         type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
-         type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
-         type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
-         type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
-         type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
-         type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
-         type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
-         type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
-         1;
- }
-
- template <ggml_type type, int ncols_y>
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
- // tell the compiler to use as many registers as it wants, see nwarps definition below
- __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
- static __global__ void mul_mat_vec_q(
-     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
-
-     constexpr int qk = ggml_cuda_type_traits<type>::qk;
-     constexpr int qi = ggml_cuda_type_traits<type>::qi;
-     constexpr int vdr = get_vdr_mmvq(type);
-
-     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
-
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
-     constexpr int nwarps = 1;
-     constexpr int rows_per_cuda_block = 1;
- #else
-     constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
-     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-
-     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-     const int row0 = rows_per_cuda_block*blockIdx.x;
-     const int blocks_per_row_x = ncols_x / qk;
-     const int blocks_per_col_y = nrows_y / QK8_1;
-     constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
-
-     // partial sum for each thread
-     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
-
-     const block_q8_1 * y = (const block_q8_1 *) vy;
-
-     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
-         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
-
-         // x block quant index when casting the quants to int
-         const int kqs = vdr * (tid % (qi/vdr));
-
- #pragma unroll
-         for (int j = 0; j < ncols_y; ++j) {
- #pragma unroll
-             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                 tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
-             }
-         }
-     }
-
-     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
-     if (threadIdx.y > 0) {
- #pragma unroll
-         for (int j = 0; j < ncols_y; ++j) {
- #pragma unroll
-             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                 tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
-             }
-         }
-     }
-     __syncthreads();
-     if (threadIdx.y > 0) {
-         return;
-     }
-
-     // sum up partial sums and write back result
- #pragma unroll
-     for (int j = 0; j < ncols_y; ++j) {
- #pragma unroll
-         for (int i = 0; i < rows_per_cuda_block; ++i) {
- #pragma unroll
-             for (int l = 0; l < nwarps-1; ++l) {
-                 tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
-             }
-             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
-         }
-
-         if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
-             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
-         }
-     }
- }
-
- template <ggml_type type>
- static void mul_mat_vec_q_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
-     GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
-
-     int id = ggml_cuda_get_device();
-
-     int64_t nwarps = 1;
-     int64_t rows_per_cuda_block = 1;
-
-     if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
-         switch(ncols_y) {
-             case 1:
-                 nwarps = 4;
-                 rows_per_cuda_block = 1;
-                 break;
-             case 2:
-             case 3:
-             case 4:
-                 nwarps = 4;
-                 rows_per_cuda_block = 2;
-                 break;
-             case 5:
-             case 6:
-             case 7:
-             case 8:
-                 nwarps = 2;
-                 rows_per_cuda_block = 2;
-                 break;
-             default:
-                 GGML_ASSERT(false);
-                 break;
-         }
-     }
-     const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
-     const dim3 block_nums(nblocks, 1, 1);
-     const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-     switch (ncols_y) {
-         case 1:
-             mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 2:
-             mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 3:
-             mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 4:
-             mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 5:
-             mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 6:
-             mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 7:
-             mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         case 8:
-             mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
-             break;
-         default:
-             GGML_ASSERT(false);
-             break;
-     }
- }
-
- static void mul_mat_vec_q4_0_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q4_1_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q5_0_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q5_1_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q8_0_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q2_K_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q3_K_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q4_K_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q5_K_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_q6_K_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq2_xxs_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq2_xs_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq2_s_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq3_xxs_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq1_s_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq1_m_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq4_nl_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq4_xs_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- static void mul_mat_vec_iq3_s_q8_1_cuda(
-     const void * vx, const void * vy, float * dst,
-     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
- }
-
- void ggml_cuda_op_mul_mat_vec_q(
-     ggml_backend_cuda_context & ctx,
-     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-     const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-     const int64_t ne00 = src0->ne[0];
-     const int64_t row_diff = row_high - row_low;
-
-     const int64_t ne10 = src1->ne[0];
-     GGML_ASSERT(ne10 % QK8_1 == 0);
-
-     const int64_t ne0 = dst->ne[0];
-
-     int id = ggml_cuda_get_device();
-
-     // the main device has a larger memory buffer to hold the results from all GPUs
-     // nrows_dst == nrows of the matrix that the kernel writes into
-     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
-
-     switch (src0->type) {
-         case GGML_TYPE_Q4_0:
-             mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q4_1:
-             mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q5_0:
-             mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q5_1:
-             mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q8_0:
-             mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q2_K:
-             mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q3_K:
-             mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q4_K:
-             mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q5_K:
-             mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_Q6_K:
-             mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ2_XXS:
-             mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ2_XS:
-             mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ2_S:
-             mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ3_XXS:
-             mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ1_S:
-             mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ1_M:
-             mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ4_NL:
-             mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ4_XS:
-             mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         case GGML_TYPE_IQ3_S:
-             mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-             break;
-         default:
-             GGML_ASSERT(false);
-             break;
-     }
-
-     GGML_UNUSED(src1);
-     GGML_UNUSED(dst);
-     GGML_UNUSED(src1_ddf_i);
-     GGML_UNUSED(src1_ncols);
-     GGML_UNUSED(src1_padded_row_size);
- }
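
The tail of mul_mat_vec_q above is a standard two-stage block reduction: every thread accumulates a private partial sum, warps other than warp 0 park their partials in shared memory, and warp 0 folds them in and finishes with warp_reduce_sum, which is defined in the library's common CUDA headers rather than in this hunk. Below is a minimal, self-contained sketch of the same pattern applied to a plain dot product; it assumes warp_reduce_sum is the usual shuffle-based butterfly, and dot_kernel is a hypothetical name, not llama.cpp code.

// Sketch of the block-reduction pattern (assumptions, not library code):
// per-thread partials -> shared-memory handoff -> warp-0 shuffle reduction.
#include <cstdio>
#include <cuda_runtime.h>

#define WARP_SIZE 32

__device__ float warp_reduce_sum(float x) {
    // butterfly reduction across the 32 lanes of one warp, in registers
    for (int offset = WARP_SIZE/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset);
    }
    return x;
}

template <int nwarps>
__global__ void dot_kernel(const float * a, const float * b, float * out, int n) {
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;

    float tmp = 0.0f; // per-thread partial sum
    for (int i = tid; i < n; i += nwarps*WARP_SIZE) {
        tmp += a[i] * b[i];
    }

    // warps 1..nwarps-1 stash their partials and retire after the barrier
    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][WARP_SIZE];
    if (threadIdx.y > 0) {
        tmp_shared[threadIdx.y-1][threadIdx.x] = tmp;
    }
    __syncthreads();
    if (threadIdx.y > 0) {
        return;
    }

    // warp 0 folds in the other warps' partials, then reduces across lanes
    for (int l = 0; l < nwarps-1; ++l) {
        tmp += tmp_shared[l][threadIdx.x];
    }
    tmp = warp_reduce_sum(tmp);

    if (threadIdx.x == 0) {
        *out = tmp;
    }
}

int main() {
    const int n = 1024;
    float ha[n], hb[n];
    for (int i = 0; i < n; ++i) { ha[i] = 1.0f; hb[i] = 2.0f; }
    float *da, *db, *dout;
    cudaMalloc(&da, n*sizeof(float));
    cudaMalloc(&db, n*sizeof(float));
    cudaMalloc(&dout, sizeof(float));
    cudaMemcpy(da, ha, n*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, n*sizeof(float), cudaMemcpyHostToDevice);
    const dim3 block_dims(WARP_SIZE, 4, 1); // 4 warps, like the ncols_y <= 4 case above
    dot_kernel<4><<<1, block_dims>>>(da, db, dout, n);
    float result = 0.0f;
    cudaMemcpy(&result, dout, sizeof(float), cudaMemcpyDeviceToHost);
    printf("dot = %g (expected %g)\n", result, 2.0f*n);
    cudaFree(da); cudaFree(db); cudaFree(dout);
    return 0;
}

The mul_mat_vec_q kernel uses the same shape, except each thread carries an ncols_y x rows_per_cuda_block grid of partials and the per-element product is one of the quantized vec_dot_*_q8_1 routines selected at compile time.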