llama_cpp 0.15.4 → 0.16.1

Files changed (161)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/ext/llama_cpp/extconf.rb +3 -2
  4. data/ext/llama_cpp/llama_cpp.cpp +17 -3
  5. data/lib/llama_cpp/version.rb +2 -2
  6. data/sig/llama_cpp.rbs +15 -1
  7. data/vendor/tmp/llama.cpp/Makefile +166 -82
  8. data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
  9. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
  10. data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
  11. data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
  12. data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
  13. data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
  14. data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
  15. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
  16. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
  17. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
  18. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
  19. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
  20. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
  21. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
  22. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
  23. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
  24. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
  25. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
  26. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
  27. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
  28. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
  29. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
  30. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
  31. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
  32. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
  33. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
  34. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
  35. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
  36. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
  37. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
  38. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
  39. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
  40. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  41. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  42. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  43. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  44. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  45. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  141. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
  142. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
  143. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
  144. data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
  145. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
  146. data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
  147. data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
  148. data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
  149. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
  150. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
  151. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
  152. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
  153. data/vendor/tmp/llama.cpp/ggml.c +278 -603
  154. data/vendor/tmp/llama.cpp/ggml.h +9 -28
  155. data/vendor/tmp/llama.cpp/llama.cpp +345 -473
  156. data/vendor/tmp/llama.cpp/llama.h +21 -43
  157. metadata +134 -7
  158. data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
  159. data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
  160. data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
  161. data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu
@@ -0,0 +1,88 @@
+ #include "mmq.cuh"
+
+ void ggml_cuda_op_mul_mat_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+     const int64_t ne00 = src0->ne[0];
+
+     const int64_t nb01 = src0->nb[1];
+
+     const int64_t ne10 = src1->ne[0];
+     const int64_t ne11 = src1->ne[1];
+     GGML_ASSERT(ne10 % QK8_1 == 0);
+
+     const int64_t ne0 = dst->ne[0];
+
+     const int64_t row_diff = row_high - row_low;
+     const int64_t stride00 = nb01 / ggml_type_size(src0->type);
+
+     int id = ggml_cuda_get_device();
+     const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+     // the main device has a larger memory buffer to hold the results from all GPUs
+     // nrows_dst == nrows of the matrix that the kernel writes into
+     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+             break;
+         case GGML_TYPE_Q4_1:
+             mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+             break;
+         case GGML_TYPE_Q5_0:
+             mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+             break;
+         case GGML_TYPE_Q5_1:
+             mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+             break;
+         case GGML_TYPE_Q8_0:
+             mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+             break;
+         case GGML_TYPE_Q2_K:
+             mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+             break;
+         case GGML_TYPE_Q3_K:
+             mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+             break;
+         case GGML_TYPE_Q4_K:
+             mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+             break;
+         case GGML_TYPE_Q5_K:
+             mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+             break;
+         case GGML_TYPE_Q6_K:
+             mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddf_i);
+ }
+
+ bool ggml_cuda_supports_mmq(enum ggml_type type) {
+     switch (type) {
+         case GGML_TYPE_Q4_0:
+         case GGML_TYPE_Q4_1:
+         case GGML_TYPE_Q5_0:
+         case GGML_TYPE_Q5_1:
+         case GGML_TYPE_Q8_0:
+         case GGML_TYPE_Q2_K:
+         case GGML_TYPE_Q3_K:
+         case GGML_TYPE_Q4_K:
+         case GGML_TYPE_Q5_K:
+         case GGML_TYPE_Q6_K:
+             return true;
+         default:
+             return false;
+     }
+ }
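
The new mmq.cu above routes quantized matrix multiplication through mul_mat_q_case<...> based on src0->type, and exposes ggml_cuda_supports_mmq so a caller can check whether a quantization format has an MMQ kernel before taking this path. The snippet below is a hypothetical caller-side guard added for illustration only; the should_use_mmq name is not part of the gem, and it assumes the declarations from the vendored ggml/mmq headers are in scope.

// Illustrative only -- not part of the vendored sources.
// Route a matrix multiplication to the MMQ path only when the quantized
// source tensor uses a type covered by the kernels in mmq.cu.
static bool should_use_mmq(const ggml_tensor * src0) {
    // true for Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 and the Q2_K..Q6_K K-quants,
    // false for f16/f32 and the IQ types (see ggml_cuda_supports_mmq above)
    return ggml_cuda_supports_mmq(src0->type);
}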
data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu
@@ -0,0 +1,419 @@
+ #include "mmvq.cuh"
+ #include "vecdotq.cuh"
+
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
+
+ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
+     return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
+         type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
+         type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
+         type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
+         type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
+         type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
+         type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
+         type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
+         type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
+         type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
+         type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
+         type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
+         type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
+         type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
+         type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
+         type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
+         type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
+         type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
+         type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
+         nullptr;
+ }
+
+ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
+     return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
+         type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
+         type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
+         type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
+         type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
+         type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
+         type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
+         type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
+         type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
+         type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
+         type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
+         1;
+ }
+
+ template <ggml_type type, int ncols_y>
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ // tell the compiler to use as many registers as it wants, see nwarps definition below
+ __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ static __global__ void mul_mat_vec_q(
+     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
+
+     constexpr int qk  = ggml_cuda_type_traits<type>::qk;
+     constexpr int qi  = ggml_cuda_type_traits<type>::qi;
+     constexpr int vdr = get_vdr_mmvq(type);
+
+     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
+
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+     constexpr int nwarps              = 1;
+     constexpr int rows_per_cuda_block = 1;
+ #else
+     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+
+     const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+     const     int row0 = rows_per_cuda_block*blockIdx.x;
+     const     int blocks_per_row_x = ncols_x / qk;
+     const     int blocks_per_col_y = nrows_y / QK8_1;
+     constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
+
+     // partial sum for each thread
+     float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
+
+     const block_q8_1 * y = (const block_q8_1 *) vy;
+
+     for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+         const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
+
+         // x block quant index when casting the quants to int
+         const int kqs = vdr * (tid % (qi/vdr));
+
+ #pragma unroll
+         for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+             for (int i = 0; i < rows_per_cuda_block; ++i) {
+                 tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
+             }
+         }
+     }
+
+     __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
+     if (threadIdx.y > 0) {
+ #pragma unroll
+         for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+             for (int i = 0; i < rows_per_cuda_block; ++i) {
+                 tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+             }
+         }
+     }
+     __syncthreads();
+     if (threadIdx.y > 0) {
+         return;
+     }
+
+     // sum up partial sums and write back result
+ #pragma unroll
+     for (int j = 0; j < ncols_y; ++j) {
+ #pragma unroll
+         for (int i = 0; i < rows_per_cuda_block; ++i) {
+ #pragma unroll
+             for (int l = 0; l < nwarps-1; ++l) {
+                 tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+             }
+             tmp[j][i] = warp_reduce_sum(tmp[j][i]);
+         }
+
+         if (threadIdx.x < rows_per_cuda_block) {
+             dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
+         }
+     }
+ }
+
+ template <ggml_type type>
+ static void mul_mat_vec_q_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
+     GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
+
+     int id = ggml_cuda_get_device();
+
+     int64_t nwarps = 1;
+     int64_t rows_per_cuda_block = 1;
+
+     if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+         switch(ncols_y) {
+             case 1:
+                 nwarps = 4;
+                 rows_per_cuda_block = 1;
+                 break;
+             case 2:
+             case 3:
+             case 4:
+                 nwarps = 4;
+                 rows_per_cuda_block = 2;
+                 break;
+             case 5:
+             case 6:
+             case 7:
+             case 8:
+                 nwarps = 2;
+                 rows_per_cuda_block = 2;
+                 break;
+             default:
+                 GGML_ASSERT(false);
+                 break;
+         }
+     }
+     const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+     const dim3 block_nums(nblocks, 1, 1);
+     const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+     switch (ncols_y) {
+         case 1:
+             mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 2:
+             mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 3:
+             mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 4:
+             mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 5:
+             mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 6:
+             mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 7:
+             mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         case 8:
+             mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+ }
+
+ static void mul_mat_vec_q4_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q4_1_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_1_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q8_0_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q2_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q3_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q4_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q5_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_q6_K_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_xxs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_xs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq2_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq3_xxs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq1_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq1_m_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq4_nl_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq4_xs_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ static void mul_mat_vec_iq3_s_q8_1_cuda(
+     const void * vx, const void * vy, float * dst,
+     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+     mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+ }
+
+ void ggml_cuda_op_mul_mat_vec_q(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t row_diff = row_high - row_low;
+
+     const int64_t ne10 = src1->ne[0];
+     GGML_ASSERT(ne10 % QK8_1 == 0);
+
+     const int64_t ne0 = dst->ne[0];
+
+     int id = ggml_cuda_get_device();
+
+     // the main device has a larger memory buffer to hold the results from all GPUs
+     // nrows_dst == nrows of the matrix that the kernel writes into
+     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+     switch (src0->type) {
+         case GGML_TYPE_Q4_0:
+             mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q4_1:
+             mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_0:
+             mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_1:
+             mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q8_0:
+             mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q2_K:
+             mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q3_K:
+             mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q4_K:
+             mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q5_K:
+             mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_Q6_K:
+             mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_XXS:
+             mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_XS:
+             mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ2_S:
+             mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ3_XXS:
+             mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ1_S:
+             mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ1_M:
+             mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ4_NL:
+             mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ4_XS:
+             mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         case GGML_TYPE_IQ3_S:
+             mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+             break;
+         default:
+             GGML_ASSERT(false);
+             break;
+     }
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_ddf_i);
+     GGML_UNUSED(src1_ncols);
+     GGML_UNUSED(src1_padded_row_size);
+ }
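
For reference, mul_mat_vec_q_cuda above derives its launch shape from the batch width ncols_y: on NVIDIA and pre-RDNA2 AMD GPUs it uses 4 warps and 1 destination row per block when ncols_y == 1, 4 warps and 2 rows for ncols_y 2-4, and 2 warps and 2 rows for ncols_y 5-8, then tiles nrows_x in rows_per_cuda_block steps. The helper below is a hypothetical restatement of that arithmetic (the name and signature are illustrative, not from the gem). For example, ncols_y = 4 and nrows_x = 4096 give 4 warps, 2 rows per block, and 2048 blocks of WARP_SIZE x 4 threads.

#include <cstdint>

// Illustrative only -- mirrors the launch-shape selection in mul_mat_vec_q_cuda
// for the NVIDIA / pre-RDNA2 branch; not part of the vendored sources.
static void mmvq_launch_shape(int64_t nrows_x, int64_t ncols_y,
                              int64_t & nwarps, int64_t & rows_per_block, int64_t & nblocks) {
    nwarps         = ncols_y <= 4 ? 4 : 2;  // fewer warps once the batch gets wider
    rows_per_block = ncols_y == 1 ? 1 : 2;  // single-column case writes one dst row per block
    nblocks        = (nrows_x + rows_per_block - 1) / rows_per_block;  // ceil(nrows_x / rows_per_block)
}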