llama_cpp 0.16.2 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
#include "mmq.cuh"
|
|
2
|
-
|
|
3
|
-
void ggml_cuda_op_mul_mat_q(
|
|
4
|
-
ggml_backend_cuda_context & ctx,
|
|
5
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
|
6
|
-
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
7
|
-
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
|
8
|
-
|
|
9
|
-
const int64_t ne00 = src0->ne[0];
|
|
10
|
-
|
|
11
|
-
const int64_t nb01 = src0->nb[1];
|
|
12
|
-
|
|
13
|
-
const int64_t ne10 = src1->ne[0];
|
|
14
|
-
const int64_t ne11 = src1->ne[1];
|
|
15
|
-
GGML_ASSERT(ne10 % QK8_1 == 0);
|
|
16
|
-
|
|
17
|
-
const int64_t ne0 = dst->ne[0];
|
|
18
|
-
|
|
19
|
-
const int64_t row_diff = row_high - row_low;
|
|
20
|
-
const int64_t stride00 = nb01 / ggml_type_size(src0->type);
|
|
21
|
-
|
|
22
|
-
int id = ggml_cuda_get_device();
|
|
23
|
-
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
|
24
|
-
|
|
25
|
-
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
26
|
-
// nrows_dst == nrows of the matrix that the kernel writes into
|
|
27
|
-
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
|
28
|
-
|
|
29
|
-
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
|
|
30
|
-
|
|
31
|
-
switch (src0->type) {
|
|
32
|
-
case GGML_TYPE_Q4_0:
|
|
33
|
-
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
|
34
|
-
break;
|
|
35
|
-
case GGML_TYPE_Q4_1:
|
|
36
|
-
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
|
|
37
|
-
break;
|
|
38
|
-
case GGML_TYPE_Q5_0:
|
|
39
|
-
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
|
|
40
|
-
break;
|
|
41
|
-
case GGML_TYPE_Q5_1:
|
|
42
|
-
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
|
|
43
|
-
break;
|
|
44
|
-
case GGML_TYPE_Q8_0:
|
|
45
|
-
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
|
|
46
|
-
break;
|
|
47
|
-
case GGML_TYPE_Q2_K:
|
|
48
|
-
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
|
|
49
|
-
break;
|
|
50
|
-
case GGML_TYPE_Q3_K:
|
|
51
|
-
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
|
|
52
|
-
break;
|
|
53
|
-
case GGML_TYPE_Q4_K:
|
|
54
|
-
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
|
|
55
|
-
break;
|
|
56
|
-
case GGML_TYPE_Q5_K:
|
|
57
|
-
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
|
|
58
|
-
break;
|
|
59
|
-
case GGML_TYPE_Q6_K:
|
|
60
|
-
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
|
61
|
-
break;
|
|
62
|
-
default:
|
|
63
|
-
GGML_ASSERT(false);
|
|
64
|
-
break;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
GGML_UNUSED(src1);
|
|
68
|
-
GGML_UNUSED(dst);
|
|
69
|
-
GGML_UNUSED(src1_ddf_i);
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
|
73
|
-
switch (type) {
|
|
74
|
-
case GGML_TYPE_Q4_0:
|
|
75
|
-
case GGML_TYPE_Q4_1:
|
|
76
|
-
case GGML_TYPE_Q5_0:
|
|
77
|
-
case GGML_TYPE_Q5_1:
|
|
78
|
-
case GGML_TYPE_Q8_0:
|
|
79
|
-
case GGML_TYPE_Q2_K:
|
|
80
|
-
case GGML_TYPE_Q3_K:
|
|
81
|
-
case GGML_TYPE_Q4_K:
|
|
82
|
-
case GGML_TYPE_Q5_K:
|
|
83
|
-
case GGML_TYPE_Q6_K:
|
|
84
|
-
return true;
|
|
85
|
-
default:
|
|
86
|
-
return false;
|
|
87
|
-
}
|
|
88
|
-
}
|
|
@@ -1,419 +0,0 @@
|
|
|
1
|
-
#include "mmvq.cuh"
|
|
2
|
-
#include "vecdotq.cuh"
|
|
3
|
-
|
|
4
|
-
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
|
|
5
|
-
|
|
6
|
-
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
|
|
7
|
-
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
|
|
8
|
-
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
|
|
9
|
-
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
|
|
10
|
-
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
|
|
11
|
-
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
|
|
12
|
-
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
|
|
13
|
-
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
|
|
14
|
-
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
|
|
15
|
-
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
|
|
16
|
-
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
|
|
17
|
-
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
|
|
18
|
-
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
|
|
19
|
-
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
|
|
20
|
-
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
|
|
21
|
-
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
|
|
22
|
-
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
|
|
23
|
-
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
|
|
24
|
-
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
|
|
25
|
-
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
|
|
26
|
-
nullptr;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
|
30
|
-
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
|
|
31
|
-
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
|
|
32
|
-
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
|
|
33
|
-
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
|
|
34
|
-
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
|
|
35
|
-
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
|
|
36
|
-
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
|
|
37
|
-
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
|
|
38
|
-
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
|
|
39
|
-
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
|
|
40
|
-
type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ :
|
|
41
|
-
1;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
template <ggml_type type, int ncols_y>
|
|
45
|
-
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
46
|
-
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
|
47
|
-
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
|
48
|
-
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
|
49
|
-
static __global__ void mul_mat_vec_q(
|
|
50
|
-
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
|
51
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
|
52
|
-
|
|
53
|
-
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
|
54
|
-
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
|
55
|
-
constexpr int vdr = get_vdr_mmvq(type);
|
|
56
|
-
|
|
57
|
-
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
|
58
|
-
|
|
59
|
-
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
|
60
|
-
constexpr int nwarps = 1;
|
|
61
|
-
constexpr int rows_per_cuda_block = 1;
|
|
62
|
-
#else
|
|
63
|
-
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
|
64
|
-
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
|
65
|
-
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
|
66
|
-
|
|
67
|
-
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
|
68
|
-
const int row0 = rows_per_cuda_block*blockIdx.x;
|
|
69
|
-
const int blocks_per_row_x = ncols_x / qk;
|
|
70
|
-
const int blocks_per_col_y = nrows_y / QK8_1;
|
|
71
|
-
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
|
72
|
-
|
|
73
|
-
// partial sum for each thread
|
|
74
|
-
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
|
75
|
-
|
|
76
|
-
const block_q8_1 * y = (const block_q8_1 *) vy;
|
|
77
|
-
|
|
78
|
-
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
|
79
|
-
const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
|
|
80
|
-
|
|
81
|
-
// x block quant index when casting the quants to int
|
|
82
|
-
const int kqs = vdr * (tid % (qi/vdr));
|
|
83
|
-
|
|
84
|
-
#pragma unroll
|
|
85
|
-
for (int j = 0; j < ncols_y; ++j) {
|
|
86
|
-
#pragma unroll
|
|
87
|
-
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
|
88
|
-
tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
|
|
94
|
-
if (threadIdx.y > 0) {
|
|
95
|
-
#pragma unroll
|
|
96
|
-
for (int j = 0; j < ncols_y; ++j) {
|
|
97
|
-
#pragma unroll
|
|
98
|
-
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
|
99
|
-
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
__syncthreads();
|
|
104
|
-
if (threadIdx.y > 0) {
|
|
105
|
-
return;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
// sum up partial sums and write back result
|
|
109
|
-
#pragma unroll
|
|
110
|
-
for (int j = 0; j < ncols_y; ++j) {
|
|
111
|
-
#pragma unroll
|
|
112
|
-
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
|
113
|
-
#pragma unroll
|
|
114
|
-
for (int l = 0; l < nwarps-1; ++l) {
|
|
115
|
-
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
|
116
|
-
}
|
|
117
|
-
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
|
|
121
|
-
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
template <ggml_type type>
|
|
127
|
-
static void mul_mat_vec_q_cuda(
|
|
128
|
-
const void * vx, const void * vy, float * dst,
|
|
129
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
130
|
-
|
|
131
|
-
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
|
|
132
|
-
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
|
133
|
-
|
|
134
|
-
int id = ggml_cuda_get_device();
|
|
135
|
-
|
|
136
|
-
int64_t nwarps = 1;
|
|
137
|
-
int64_t rows_per_cuda_block = 1;
|
|
138
|
-
|
|
139
|
-
if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
|
140
|
-
switch(ncols_y) {
|
|
141
|
-
case 1:
|
|
142
|
-
nwarps = 4;
|
|
143
|
-
rows_per_cuda_block = 1;
|
|
144
|
-
break;
|
|
145
|
-
case 2:
|
|
146
|
-
case 3:
|
|
147
|
-
case 4:
|
|
148
|
-
nwarps = 4;
|
|
149
|
-
rows_per_cuda_block = 2;
|
|
150
|
-
break;
|
|
151
|
-
case 5:
|
|
152
|
-
case 6:
|
|
153
|
-
case 7:
|
|
154
|
-
case 8:
|
|
155
|
-
nwarps = 2;
|
|
156
|
-
rows_per_cuda_block = 2;
|
|
157
|
-
break;
|
|
158
|
-
default:
|
|
159
|
-
GGML_ASSERT(false);
|
|
160
|
-
break;
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
|
|
164
|
-
const dim3 block_nums(nblocks, 1, 1);
|
|
165
|
-
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
|
166
|
-
|
|
167
|
-
switch (ncols_y) {
|
|
168
|
-
case 1:
|
|
169
|
-
mul_mat_vec_q<type, 1><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
170
|
-
break;
|
|
171
|
-
case 2:
|
|
172
|
-
mul_mat_vec_q<type, 2><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
173
|
-
break;
|
|
174
|
-
case 3:
|
|
175
|
-
mul_mat_vec_q<type, 3><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
176
|
-
break;
|
|
177
|
-
case 4:
|
|
178
|
-
mul_mat_vec_q<type, 4><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
179
|
-
break;
|
|
180
|
-
case 5:
|
|
181
|
-
mul_mat_vec_q<type, 5><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
182
|
-
break;
|
|
183
|
-
case 6:
|
|
184
|
-
mul_mat_vec_q<type, 6><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
185
|
-
break;
|
|
186
|
-
case 7:
|
|
187
|
-
mul_mat_vec_q<type, 7><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
188
|
-
break;
|
|
189
|
-
case 8:
|
|
190
|
-
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
|
191
|
-
break;
|
|
192
|
-
default:
|
|
193
|
-
GGML_ASSERT(false);
|
|
194
|
-
break;
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
static void mul_mat_vec_q4_0_q8_1_cuda(
|
|
199
|
-
const void * vx, const void * vy, float * dst,
|
|
200
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
201
|
-
|
|
202
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
static void mul_mat_vec_q4_1_q8_1_cuda(
|
|
206
|
-
const void * vx, const void * vy, float * dst,
|
|
207
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
208
|
-
|
|
209
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
static void mul_mat_vec_q5_0_q8_1_cuda(
|
|
213
|
-
const void * vx, const void * vy, float * dst,
|
|
214
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
215
|
-
|
|
216
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
static void mul_mat_vec_q5_1_q8_1_cuda(
|
|
220
|
-
const void * vx, const void * vy, float * dst,
|
|
221
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
222
|
-
|
|
223
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
static void mul_mat_vec_q8_0_q8_1_cuda(
|
|
227
|
-
const void * vx, const void * vy, float * dst,
|
|
228
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
229
|
-
|
|
230
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
static void mul_mat_vec_q2_K_q8_1_cuda(
|
|
234
|
-
const void * vx, const void * vy, float * dst,
|
|
235
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
236
|
-
|
|
237
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
static void mul_mat_vec_q3_K_q8_1_cuda(
|
|
241
|
-
const void * vx, const void * vy, float * dst,
|
|
242
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
243
|
-
|
|
244
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
static void mul_mat_vec_q4_K_q8_1_cuda(
|
|
248
|
-
const void * vx, const void * vy, float * dst,
|
|
249
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
250
|
-
|
|
251
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
static void mul_mat_vec_q5_K_q8_1_cuda(
|
|
255
|
-
const void * vx, const void * vy, float * dst,
|
|
256
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
257
|
-
|
|
258
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
static void mul_mat_vec_q6_K_q8_1_cuda(
|
|
262
|
-
const void * vx, const void * vy, float * dst,
|
|
263
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
264
|
-
|
|
265
|
-
mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
static void mul_mat_vec_iq2_xxs_q8_1_cuda(
|
|
269
|
-
const void * vx, const void * vy, float * dst,
|
|
270
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
271
|
-
|
|
272
|
-
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
static void mul_mat_vec_iq2_xs_q8_1_cuda(
|
|
276
|
-
const void * vx, const void * vy, float * dst,
|
|
277
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
278
|
-
|
|
279
|
-
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
static void mul_mat_vec_iq2_s_q8_1_cuda(
|
|
283
|
-
const void * vx, const void * vy, float * dst,
|
|
284
|
-
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
|
285
|
-
|
|
286
|
-
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
// Type-specific entry point for IQ3_XXS x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>.
static void mul_mat_vec_iq3_xxs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
295
|
-
|
|
296
|
-
// Type-specific entry point for IQ1_S x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>.
static void mul_mat_vec_iq1_s_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
302
|
-
|
|
303
|
-
// Type-specific entry point for IQ1_M x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>.
static void mul_mat_vec_iq1_m_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
309
|
-
|
|
310
|
-
// Type-specific entry point for IQ4_NL x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>.
static void mul_mat_vec_iq4_nl_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
316
|
-
|
|
317
|
-
// Type-specific entry point for IQ4_XS x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>.
static void mul_mat_vec_iq4_xs_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
323
|
-
|
|
324
|
-
// Type-specific entry point for IQ3_S x q8_1 mat-vec multiplication:
// forwards all arguments unchanged to mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>.
static void mul_mat_vec_iq3_s_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
|
|
330
|
-
|
|
331
|
-
// Dispatch the quantized mat-vec multiplication for src0->type.
// All type-specific launchers share one signature, so instead of one call per
// switch case we pick the launcher through a function pointer and invoke it once.
void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00     = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0); // src1 must consist of whole q8_1 blocks

    const int64_t ne0 = dst->ne[0];

    const int id = ggml_cuda_get_device();

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    // common signature of every type-specific launcher above
    typedef void (*mmvq_launch_t)(const void *, const void *, float *, int, int, int, int, int, cudaStream_t);

    mmvq_launch_t launch = nullptr;
    switch (src0->type) {
        case GGML_TYPE_Q4_0:    launch = mul_mat_vec_q4_0_q8_1_cuda;    break;
        case GGML_TYPE_Q4_1:    launch = mul_mat_vec_q4_1_q8_1_cuda;    break;
        case GGML_TYPE_Q5_0:    launch = mul_mat_vec_q5_0_q8_1_cuda;    break;
        case GGML_TYPE_Q5_1:    launch = mul_mat_vec_q5_1_q8_1_cuda;    break;
        case GGML_TYPE_Q8_0:    launch = mul_mat_vec_q8_0_q8_1_cuda;    break;
        case GGML_TYPE_Q2_K:    launch = mul_mat_vec_q2_K_q8_1_cuda;    break;
        case GGML_TYPE_Q3_K:    launch = mul_mat_vec_q3_K_q8_1_cuda;    break;
        case GGML_TYPE_Q4_K:    launch = mul_mat_vec_q4_K_q8_1_cuda;    break;
        case GGML_TYPE_Q5_K:    launch = mul_mat_vec_q5_K_q8_1_cuda;    break;
        case GGML_TYPE_Q6_K:    launch = mul_mat_vec_q6_K_q8_1_cuda;    break;
        case GGML_TYPE_IQ2_XXS: launch = mul_mat_vec_iq2_xxs_q8_1_cuda; break;
        case GGML_TYPE_IQ2_XS:  launch = mul_mat_vec_iq2_xs_q8_1_cuda;  break;
        case GGML_TYPE_IQ2_S:   launch = mul_mat_vec_iq2_s_q8_1_cuda;   break;
        case GGML_TYPE_IQ3_XXS: launch = mul_mat_vec_iq3_xxs_q8_1_cuda; break;
        case GGML_TYPE_IQ1_S:   launch = mul_mat_vec_iq1_s_q8_1_cuda;   break;
        case GGML_TYPE_IQ1_M:   launch = mul_mat_vec_iq1_m_q8_1_cuda;   break;
        case GGML_TYPE_IQ4_NL:  launch = mul_mat_vec_iq4_nl_q8_1_cuda;  break;
        case GGML_TYPE_IQ4_XS:  launch = mul_mat_vec_iq4_xs_q8_1_cuda;  break;
        case GGML_TYPE_IQ3_S:   launch = mul_mat_vec_iq3_s_q8_1_cuda;   break;
        default:
            GGML_ASSERT(false); // unsupported quantization type (aborts, so launch is never null below)
            break;
    }

    launch(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}
|