llama_cpp 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -12
  4. data/ext/llama_cpp/extconf.rb +2 -43
  5. data/ext/llama_cpp/llama_cpp.cpp +8 -0
  6. data/lib/llama_cpp/version.rb +3 -3
  7. data/sig/llama_cpp.rbs +3 -0
  8. metadata +2 -171
  9. data/vendor/include/.gitkeep +0 -0
  10. data/vendor/lib/.gitkeep +0 -0
  11. data/vendor/tmp/llama.cpp/LICENSE +0 -21
  12. data/vendor/tmp/llama.cpp/Makefile +0 -1124
  13. data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
  14. data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
  15. data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
  16. data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
  17. data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
  18. data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
  19. data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
  20. data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
  21. data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
  22. data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
  23. data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
  24. data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
  25. data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
  26. data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
  27. data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
  28. data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
  29. data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
  30. data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
  31. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
  32. data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
  33. data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
  34. data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
  35. data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
  36. data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
  37. data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
  38. data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
  39. data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
  40. data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
  41. data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
  42. data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
  43. data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
  44. data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
  45. data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
  46. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  47. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  48. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  49. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  50. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  51. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  52. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  53. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  54. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  55. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  56. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  57. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  58. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  59. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  60. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  61. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  62. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  63. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  64. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  65. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  66. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  67. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  68. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  69. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  70. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  71. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  72. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  73. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  74. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  75. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  76. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  77. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  78. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  79. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  80. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  81. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  82. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  83. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  84. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  85. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  86. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  87. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  88. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  89. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  90. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  91. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  92. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  93. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  94. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  95. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  96. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  97. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  98. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  99. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  100. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  101. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  102. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  103. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  104. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  105. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  106. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  107. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  108. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  109. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  110. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  111. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  112. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  113. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  114. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  115. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  116. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  117. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  118. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  119. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  120. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  121. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  122. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  123. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  124. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  125. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  126. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  127. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  128. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  129. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  130. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  131. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  132. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
  133. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
  134. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
  135. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
  136. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
  137. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
  138. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
  139. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
  140. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
  141. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
  142. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
  143. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
  144. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
  145. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
  146. data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
  147. data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
  148. data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
  149. data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
  150. data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
  151. data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
  152. data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
  153. data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
  154. data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
  155. data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
  156. data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
  157. data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
  158. data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
  159. data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
  160. data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
  161. data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
  162. data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
  163. data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
  164. data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
  165. data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
  166. data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
  167. data/vendor/tmp/llama.cpp/ggml.c +0 -22506
  168. data/vendor/tmp/llama.cpp/ggml.h +0 -2458
  169. data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
  170. data/vendor/tmp/llama.cpp/llama.h +0 -1147
  171. data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
  172. data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
  173. data/vendor/tmp/llama.cpp/sgemm.h +0 -14
  174. data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
  175. data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
  176. data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
  177. data/vendor/tmp/llama.cpp/unicode.h +0 -63
@@ -1,88 +0,0 @@
1
- #include "mmq.cuh"
2
-
3
- void ggml_cuda_op_mul_mat_q(
4
- ggml_backend_cuda_context & ctx,
5
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
6
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7
- const int64_t src1_padded_row_size, cudaStream_t stream) {
8
-
9
- const int64_t ne00 = src0->ne[0];
10
-
11
- const int64_t nb01 = src0->nb[1];
12
-
13
- const int64_t ne10 = src1->ne[0];
14
- const int64_t ne11 = src1->ne[1];
15
- GGML_ASSERT(ne10 % QK8_1 == 0);
16
-
17
- const int64_t ne0 = dst->ne[0];
18
-
19
- const int64_t row_diff = row_high - row_low;
20
- const int64_t stride00 = nb01 / ggml_type_size(src0->type);
21
-
22
- int id = ggml_cuda_get_device();
23
- const int compute_capability = ggml_cuda_info().devices[id].cc;
24
-
25
- // the main device has a larger memory buffer to hold the results from all GPUs
26
- // nrows_dst == nrows of the matrix that the kernel writes into
27
- const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
28
-
29
- const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
30
-
31
- switch (src0->type) {
32
- case GGML_TYPE_Q4_0:
33
- mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
34
- break;
35
- case GGML_TYPE_Q4_1:
36
- mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
37
- break;
38
- case GGML_TYPE_Q5_0:
39
- mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
40
- break;
41
- case GGML_TYPE_Q5_1:
42
- mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
43
- break;
44
- case GGML_TYPE_Q8_0:
45
- mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
46
- break;
47
- case GGML_TYPE_Q2_K:
48
- mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
49
- break;
50
- case GGML_TYPE_Q3_K:
51
- mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
52
- break;
53
- case GGML_TYPE_Q4_K:
54
- mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
55
- break;
56
- case GGML_TYPE_Q5_K:
57
- mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
58
- break;
59
- case GGML_TYPE_Q6_K:
60
- mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
61
- break;
62
- default:
63
- GGML_ASSERT(false);
64
- break;
65
- }
66
-
67
- GGML_UNUSED(src1);
68
- GGML_UNUSED(dst);
69
- GGML_UNUSED(src1_ddf_i);
70
- }
71
-
72
- bool ggml_cuda_supports_mmq(enum ggml_type type) {
73
- switch (type) {
74
- case GGML_TYPE_Q4_0:
75
- case GGML_TYPE_Q4_1:
76
- case GGML_TYPE_Q5_0:
77
- case GGML_TYPE_Q5_1:
78
- case GGML_TYPE_Q8_0:
79
- case GGML_TYPE_Q2_K:
80
- case GGML_TYPE_Q3_K:
81
- case GGML_TYPE_Q4_K:
82
- case GGML_TYPE_Q5_K:
83
- case GGML_TYPE_Q6_K:
84
- return true;
85
- default:
86
- return false;
87
- }
88
- }
@@ -1,419 +0,0 @@
1
- #include "mmvq.cuh"
2
- #include "vecdotq.cuh"
3
-
4
- typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
5
-
6
- static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
7
- return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
8
- type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
9
- type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
10
- type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
11
- type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
12
- type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
13
- type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
14
- type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
15
- type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
16
- type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
17
- type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
18
- type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
19
- type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
20
- type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
21
- type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
22
- type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
23
- type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
24
- type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
25
- type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
26
- nullptr;
27
- }
28
-
29
- static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
30
- return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
31
- type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
32
- type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
33
- type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
34
- type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
35
- type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
36
- type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
37
- type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
38
- type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
39
- type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
40
- type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
41
- 1;
42
- }
43
-
44
- template <ggml_type type, int ncols_y>
45
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
46
- // tell the compiler to use as many registers as it wants, see nwarps definition below
47
- __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
48
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
49
- static __global__ void mul_mat_vec_q(
50
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
51
- const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
52
-
53
- constexpr int qk = ggml_cuda_type_traits<type>::qk;
54
- constexpr int qi = ggml_cuda_type_traits<type>::qi;
55
- constexpr int vdr = get_vdr_mmvq(type);
56
-
57
- constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
58
-
59
- #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
60
- constexpr int nwarps = 1;
61
- constexpr int rows_per_cuda_block = 1;
62
- #else
63
- constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
64
- constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
65
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
66
-
67
- const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
68
- const int row0 = rows_per_cuda_block*blockIdx.x;
69
- const int blocks_per_row_x = ncols_x / qk;
70
- const int blocks_per_col_y = nrows_y / QK8_1;
71
- constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
72
-
73
- // partial sum for each thread
74
- float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
75
-
76
- const block_q8_1 * y = (const block_q8_1 *) vy;
77
-
78
- for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
79
- const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
80
-
81
- // x block quant index when casting the quants to int
82
- const int kqs = vdr * (tid % (qi/vdr));
83
-
84
- #pragma unroll
85
- for (int j = 0; j < ncols_y; ++j) {
86
- #pragma unroll
87
- for (int i = 0; i < rows_per_cuda_block; ++i) {
88
- tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
89
- }
90
- }
91
- }
92
-
93
- __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
94
- if (threadIdx.y > 0) {
95
- #pragma unroll
96
- for (int j = 0; j < ncols_y; ++j) {
97
- #pragma unroll
98
- for (int i = 0; i < rows_per_cuda_block; ++i) {
99
- tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
100
- }
101
- }
102
- }
103
- __syncthreads();
104
- if (threadIdx.y > 0) {
105
- return;
106
- }
107
-
108
- // sum up partial sums and write back result
109
- #pragma unroll
110
- for (int j = 0; j < ncols_y; ++j) {
111
- #pragma unroll
112
- for (int i = 0; i < rows_per_cuda_block; ++i) {
113
- #pragma unroll
114
- for (int l = 0; l < nwarps-1; ++l) {
115
- tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
116
- }
117
- tmp[j][i] = warp_reduce_sum(tmp[j][i]);
118
- }
119
-
120
- if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
121
- dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
122
- }
123
- }
124
- }
125
-
126
- template <ggml_type type>
127
- static void mul_mat_vec_q_cuda(
128
- const void * vx, const void * vy, float * dst,
129
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
130
-
131
- GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
132
- GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
133
-
134
- int id = ggml_cuda_get_device();
135
-
136
- int64_t nwarps = 1;
137
- int64_t rows_per_cuda_block = 1;
138
-
139
- if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
140
- switch(ncols_y) {
141
- case 1:
142
- nwarps = 4;
143
- rows_per_cuda_block = 1;
144
- break;
145
- case 2:
146
- case 3:
147
- case 4:
148
- nwarps = 4;
149
- rows_per_cuda_block = 2;
150
- break;
151
- case 5:
152
- case 6:
153
- case 7:
154
- case 8:
155
- nwarps = 2;
156
- rows_per_cuda_block = 2;
157
- break;
158
- default:
159
- GGML_ASSERT(false);
160
- break;
161
- }
162
- }
163
- const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
164
- const dim3 block_nums(nblocks, 1, 1);
165
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
166
-
167
- switch (ncols_y) {
168
- case 1:
169
- mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
170
- break;
171
- case 2:
172
- mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
173
- break;
174
- case 3:
175
- mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
176
- break;
177
- case 4:
178
- mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
179
- break;
180
- case 5:
181
- mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
182
- break;
183
- case 6:
184
- mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
185
- break;
186
- case 7:
187
- mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
188
- break;
189
- case 8:
190
- mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
191
- break;
192
- default:
193
- GGML_ASSERT(false);
194
- break;
195
- }
196
- }
197
-
198
- static void mul_mat_vec_q4_0_q8_1_cuda(
199
- const void * vx, const void * vy, float * dst,
200
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
201
-
202
- mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
203
- }
204
-
205
- static void mul_mat_vec_q4_1_q8_1_cuda(
206
- const void * vx, const void * vy, float * dst,
207
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
208
-
209
- mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
210
- }
211
-
212
- static void mul_mat_vec_q5_0_q8_1_cuda(
213
- const void * vx, const void * vy, float * dst,
214
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
215
-
216
- mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
217
- }
218
-
219
- static void mul_mat_vec_q5_1_q8_1_cuda(
220
- const void * vx, const void * vy, float * dst,
221
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
222
-
223
- mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
224
- }
225
-
226
- static void mul_mat_vec_q8_0_q8_1_cuda(
227
- const void * vx, const void * vy, float * dst,
228
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
229
-
230
- mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
231
- }
232
-
233
- static void mul_mat_vec_q2_K_q8_1_cuda(
234
- const void * vx, const void * vy, float * dst,
235
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
236
-
237
- mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
238
- }
239
-
240
- static void mul_mat_vec_q3_K_q8_1_cuda(
241
- const void * vx, const void * vy, float * dst,
242
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
243
-
244
- mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
245
- }
246
-
247
- static void mul_mat_vec_q4_K_q8_1_cuda(
248
- const void * vx, const void * vy, float * dst,
249
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
250
-
251
- mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
252
- }
253
-
254
- static void mul_mat_vec_q5_K_q8_1_cuda(
255
- const void * vx, const void * vy, float * dst,
256
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
257
-
258
- mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
259
- }
260
-
261
- static void mul_mat_vec_q6_K_q8_1_cuda(
262
- const void * vx, const void * vy, float * dst,
263
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
264
-
265
- mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
266
- }
267
-
268
- static void mul_mat_vec_iq2_xxs_q8_1_cuda(
269
- const void * vx, const void * vy, float * dst,
270
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
271
-
272
- mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
273
- }
274
-
275
- static void mul_mat_vec_iq2_xs_q8_1_cuda(
276
- const void * vx, const void * vy, float * dst,
277
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
278
-
279
- mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
280
- }
281
-
282
- static void mul_mat_vec_iq2_s_q8_1_cuda(
283
- const void * vx, const void * vy, float * dst,
284
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
285
-
286
- mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
287
- }
288
-
289
- static void mul_mat_vec_iq3_xxs_q8_1_cuda(
290
- const void * vx, const void * vy, float * dst,
291
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
292
-
293
- mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
294
- }
295
-
296
- static void mul_mat_vec_iq1_s_q8_1_cuda(
297
- const void * vx, const void * vy, float * dst,
298
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
299
-
300
- mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
301
- }
302
-
303
- static void mul_mat_vec_iq1_m_q8_1_cuda(
304
- const void * vx, const void * vy, float * dst,
305
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
306
-
307
- mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
308
- }
309
-
310
- static void mul_mat_vec_iq4_nl_q8_1_cuda(
311
- const void * vx, const void * vy, float * dst,
312
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
313
-
314
- mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
315
- }
316
-
317
- static void mul_mat_vec_iq4_xs_q8_1_cuda(
318
- const void * vx, const void * vy, float * dst,
319
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
320
-
321
- mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
322
- }
323
-
324
- static void mul_mat_vec_iq3_s_q8_1_cuda(
325
- const void * vx, const void * vy, float * dst,
326
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
327
-
328
- mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
329
- }
330
-
331
- void ggml_cuda_op_mul_mat_vec_q(
332
- ggml_backend_cuda_context & ctx,
333
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
334
- const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
335
- const int64_t src1_padded_row_size, cudaStream_t stream) {
336
-
337
- const int64_t ne00 = src0->ne[0];
338
- const int64_t row_diff = row_high - row_low;
339
-
340
- const int64_t ne10 = src1->ne[0];
341
- GGML_ASSERT(ne10 % QK8_1 == 0);
342
-
343
- const int64_t ne0 = dst->ne[0];
344
-
345
- int id = ggml_cuda_get_device();
346
-
347
- // the main device has a larger memory buffer to hold the results from all GPUs
348
- // nrows_dst == nrows of the matrix that the kernel writes into
349
- const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
350
-
351
- switch (src0->type) {
352
- case GGML_TYPE_Q4_0:
353
- mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
354
- break;
355
- case GGML_TYPE_Q4_1:
356
- mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
357
- break;
358
- case GGML_TYPE_Q5_0:
359
- mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
360
- break;
361
- case GGML_TYPE_Q5_1:
362
- mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
363
- break;
364
- case GGML_TYPE_Q8_0:
365
- mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
366
- break;
367
- case GGML_TYPE_Q2_K:
368
- mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
369
- break;
370
- case GGML_TYPE_Q3_K:
371
- mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
372
- break;
373
- case GGML_TYPE_Q4_K:
374
- mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
375
- break;
376
- case GGML_TYPE_Q5_K:
377
- mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
378
- break;
379
- case GGML_TYPE_Q6_K:
380
- mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
381
- break;
382
- case GGML_TYPE_IQ2_XXS:
383
- mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
384
- break;
385
- case GGML_TYPE_IQ2_XS:
386
- mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
387
- break;
388
- case GGML_TYPE_IQ2_S:
389
- mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
390
- break;
391
- case GGML_TYPE_IQ3_XXS:
392
- mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
393
- break;
394
- case GGML_TYPE_IQ1_S:
395
- mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
396
- break;
397
- case GGML_TYPE_IQ1_M:
398
- mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
399
- break;
400
- case GGML_TYPE_IQ4_NL:
401
- mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
402
- break;
403
- case GGML_TYPE_IQ4_XS:
404
- mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
405
- break;
406
- case GGML_TYPE_IQ3_S:
407
- mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
408
- break;
409
- default:
410
- GGML_ASSERT(false);
411
- break;
412
- }
413
-
414
- GGML_UNUSED(src1);
415
- GGML_UNUSED(dst);
416
- GGML_UNUSED(src1_ddf_i);
417
- GGML_UNUSED(src1_ncols);
418
- GGML_UNUSED(src1_padded_row_size);
419
- }