llama_cpp 0.15.4 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +15 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +13 -1
- data/vendor/tmp/llama.cpp/Makefile +62 -35
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
- data/vendor/tmp/llama.cpp/ggml.c +178 -330
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +242 -426
- data/vendor/tmp/llama.cpp/llama.h +17 -43
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
|
@@ -0,0 +1,1564 @@
|
|
|
1
|
+
#include "mmq.cuh"
|
|
2
|
+
#include "vecdotq.cuh"
|
|
3
|
+
|
|
4
|
+
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
|
|
5
|
+
typedef void (*load_tiles_cuda_t)(
|
|
6
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
7
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
|
|
8
|
+
typedef float (*vec_dot_q_mul_mat_cuda_t)(
|
|
9
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
10
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
|
|
11
|
+
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
|
|
12
|
+
typedef void (mul_mat_q_t)(
|
|
13
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
|
14
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
|
|
15
|
+
|
|
16
|
+
struct mmq_arch_config_t {
|
|
17
|
+
int x;
|
|
18
|
+
int y;
|
|
19
|
+
int nwarps;
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
struct mmq_config_t {
|
|
23
|
+
mmq_arch_config_t rdna2;
|
|
24
|
+
mmq_arch_config_t rdna1;
|
|
25
|
+
mmq_arch_config_t ampere;
|
|
26
|
+
mmq_arch_config_t pascal;
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
|
|
30
|
+
// x y nwarps
|
|
31
|
+
{ 64, 128, 8},
|
|
32
|
+
{ 64, 64, 8},
|
|
33
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
34
|
+
{ 4, 32, 4},
|
|
35
|
+
#else
|
|
36
|
+
{ 64, 128, 4},
|
|
37
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
38
|
+
{ 64, 64, 8},
|
|
39
|
+
};
|
|
40
|
+
constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
|
|
41
|
+
// x y nwarps
|
|
42
|
+
{ 64, 128, 8},
|
|
43
|
+
{ 64, 64, 8},
|
|
44
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
45
|
+
{ 4, 32, 4},
|
|
46
|
+
#else
|
|
47
|
+
{ 64, 128, 4},
|
|
48
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
49
|
+
{ 64, 64, 8},
|
|
50
|
+
};
|
|
51
|
+
constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
|
|
52
|
+
// x y nwarps
|
|
53
|
+
{ 64, 128, 8},
|
|
54
|
+
{ 64, 64, 8},
|
|
55
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
56
|
+
{ 4, 32, 4},
|
|
57
|
+
#else
|
|
58
|
+
{128, 64, 4},
|
|
59
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
60
|
+
{ 64, 64, 8},
|
|
61
|
+
};
|
|
62
|
+
constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
|
|
63
|
+
// x y nwarps
|
|
64
|
+
{ 64, 128, 8},
|
|
65
|
+
{ 64, 64, 8},
|
|
66
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
67
|
+
{ 4, 32, 4},
|
|
68
|
+
#else
|
|
69
|
+
{128, 64, 4},
|
|
70
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
71
|
+
{ 64, 64, 8},
|
|
72
|
+
};
|
|
73
|
+
constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
|
|
74
|
+
// x y nwarps
|
|
75
|
+
{ 64, 128, 8},
|
|
76
|
+
{ 64, 64, 8},
|
|
77
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
78
|
+
{ 4, 32, 4},
|
|
79
|
+
#else
|
|
80
|
+
{128, 64, 4},
|
|
81
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
82
|
+
{ 64, 64, 8},
|
|
83
|
+
};
|
|
84
|
+
constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
|
|
85
|
+
// x y nwarps
|
|
86
|
+
{ 64, 128, 8},
|
|
87
|
+
{128, 32, 8},
|
|
88
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
89
|
+
{ 4, 32, 4},
|
|
90
|
+
#else
|
|
91
|
+
{ 64, 128, 4},
|
|
92
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
93
|
+
{ 64, 64, 8},
|
|
94
|
+
};
|
|
95
|
+
constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
|
|
96
|
+
// x y nwarps
|
|
97
|
+
{128, 64, 8},
|
|
98
|
+
{ 32, 128, 8},
|
|
99
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
100
|
+
{ 4, 32, 4},
|
|
101
|
+
#else
|
|
102
|
+
{128, 128, 4},
|
|
103
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
104
|
+
{ 64, 64, 8},
|
|
105
|
+
};
|
|
106
|
+
constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
|
|
107
|
+
// x y nwarps
|
|
108
|
+
{ 64, 128, 8},
|
|
109
|
+
{ 32, 64, 8},
|
|
110
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
111
|
+
{ 4, 32, 4},
|
|
112
|
+
#else
|
|
113
|
+
{ 64, 128, 4},
|
|
114
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
115
|
+
{ 64, 64, 8},
|
|
116
|
+
};
|
|
117
|
+
constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
|
|
118
|
+
// x y nwarps
|
|
119
|
+
{ 64, 128, 8},
|
|
120
|
+
{ 32, 64, 8},
|
|
121
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
122
|
+
{ 4, 32, 4},
|
|
123
|
+
#else
|
|
124
|
+
{ 64, 128, 4},
|
|
125
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
126
|
+
{ 64, 64, 8},
|
|
127
|
+
};
|
|
128
|
+
constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
|
|
129
|
+
// x y nwarps
|
|
130
|
+
{ 64, 128, 8},
|
|
131
|
+
{ 32, 64, 8},
|
|
132
|
+
#ifdef CUDA_USE_TENSOR_CORES
|
|
133
|
+
{ 4, 32, 4},
|
|
134
|
+
#else
|
|
135
|
+
{ 64, 64, 4},
|
|
136
|
+
#endif // CUDA_USE_TENSOR_CORES
|
|
137
|
+
{ 64, 64, 8},
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
// ------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
143
|
+
GGML_UNUSED(x_qh);
|
|
144
|
+
GGML_UNUSED(x_sc);
|
|
145
|
+
|
|
146
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
|
147
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
|
|
148
|
+
|
|
149
|
+
*x_ql = tile_x_qs;
|
|
150
|
+
*x_dm = (half2 *) tile_x_d;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
|
154
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
155
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
156
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
157
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
158
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
159
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
160
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
161
|
+
|
|
162
|
+
const int kbx = k / QI4_0;
|
|
163
|
+
const int kqsx = k % QI4_0;
|
|
164
|
+
|
|
165
|
+
const block_q4_0 * bx0 = (const block_q4_0 *) vx;
|
|
166
|
+
|
|
167
|
+
float * x_dmf = (float *) x_dm;
|
|
168
|
+
|
|
169
|
+
#pragma unroll
|
|
170
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
171
|
+
int i = i0 + i_offset;
|
|
172
|
+
|
|
173
|
+
if (need_check) {
|
|
174
|
+
i = min(i, i_max);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
|
178
|
+
|
|
179
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
|
180
|
+
// x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
|
184
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
185
|
+
|
|
186
|
+
#pragma unroll
|
|
187
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
|
|
188
|
+
int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
|
189
|
+
|
|
190
|
+
if (need_check) {
|
|
191
|
+
i = min(i, i_max);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
195
|
+
|
|
196
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
|
201
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
202
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
203
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
204
|
+
|
|
205
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
|
206
|
+
const float * x_dmf = (const float *) x_dm;
|
|
207
|
+
|
|
208
|
+
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
|
209
|
+
|
|
210
|
+
#pragma unroll
|
|
211
|
+
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
|
212
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
|
213
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
|
217
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
|
218
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
222
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
223
|
+
|
|
224
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
|
|
225
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
|
|
226
|
+
|
|
227
|
+
*x_ql = tile_x_qs;
|
|
228
|
+
*x_dm = tile_x_dm;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
|
232
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
233
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
234
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
235
|
+
|
|
236
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
237
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
238
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
239
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
240
|
+
|
|
241
|
+
const int kbx = k / QI4_1;
|
|
242
|
+
const int kqsx = k % QI4_1;
|
|
243
|
+
|
|
244
|
+
const block_q4_1 * bx0 = (const block_q4_1 *) vx;
|
|
245
|
+
|
|
246
|
+
#pragma unroll
|
|
247
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
248
|
+
int i = i0 + i_offset;
|
|
249
|
+
|
|
250
|
+
if (need_check) {
|
|
251
|
+
i = min(i, i_max);
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
|
255
|
+
|
|
256
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
|
|
260
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
261
|
+
|
|
262
|
+
#pragma unroll
|
|
263
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
|
|
264
|
+
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
|
265
|
+
|
|
266
|
+
if (need_check) {
|
|
267
|
+
i = min(i, i_max);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
271
|
+
|
|
272
|
+
x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
|
277
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
278
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
279
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
280
|
+
|
|
281
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
|
282
|
+
|
|
283
|
+
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
|
284
|
+
|
|
285
|
+
#pragma unroll
|
|
286
|
+
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
|
287
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
|
288
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
|
292
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
|
293
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
297
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
298
|
+
|
|
299
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
|
300
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
|
|
301
|
+
|
|
302
|
+
*x_ql = tile_x_ql;
|
|
303
|
+
*x_dm = (half2 *) tile_x_d;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
|
307
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
308
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
309
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
310
|
+
|
|
311
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
312
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
313
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
314
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
315
|
+
|
|
316
|
+
const int kbx = k / QI5_0;
|
|
317
|
+
const int kqsx = k % QI5_0;
|
|
318
|
+
|
|
319
|
+
const block_q5_0 * bx0 = (const block_q5_0 *) vx;
|
|
320
|
+
|
|
321
|
+
#pragma unroll
|
|
322
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
323
|
+
int i = i0 + i_offset;
|
|
324
|
+
|
|
325
|
+
if (need_check) {
|
|
326
|
+
i = min(i, i_max);
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
|
330
|
+
|
|
331
|
+
const int ql = get_int_from_uint8(bxi->qs, kqsx);
|
|
332
|
+
const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
|
|
333
|
+
|
|
334
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
|
335
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
|
336
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
|
337
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
|
338
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
|
339
|
+
qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
|
|
340
|
+
|
|
341
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
|
342
|
+
|
|
343
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
|
344
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
|
345
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
|
346
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
|
347
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
|
348
|
+
qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
|
|
349
|
+
|
|
350
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
|
|
354
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
355
|
+
float * x_dmf = (float *) x_dm;
|
|
356
|
+
|
|
357
|
+
#pragma unroll
|
|
358
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
|
|
359
|
+
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
|
360
|
+
|
|
361
|
+
if (need_check) {
|
|
362
|
+
i = min(i, i_max);
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
366
|
+
|
|
367
|
+
x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
|
372
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
373
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
374
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
375
|
+
|
|
376
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
|
377
|
+
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
|
378
|
+
const float * x_dmf = (const float *) x_dm;
|
|
379
|
+
const float * y_df = (const float *) y_ds;
|
|
380
|
+
|
|
381
|
+
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
|
382
|
+
|
|
383
|
+
#pragma unroll
|
|
384
|
+
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
|
385
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
|
386
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
return vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
|
390
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
395
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
396
|
+
|
|
397
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
|
398
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
|
|
399
|
+
|
|
400
|
+
*x_ql = tile_x_ql;
|
|
401
|
+
*x_dm = tile_x_dm;
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
|
405
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
406
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
407
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
408
|
+
|
|
409
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
410
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
411
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
412
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
413
|
+
|
|
414
|
+
const int kbx = k / QI5_1;
|
|
415
|
+
const int kqsx = k % QI5_1;
|
|
416
|
+
|
|
417
|
+
const block_q5_1 * bx0 = (const block_q5_1 *) vx;
|
|
418
|
+
|
|
419
|
+
#pragma unroll
|
|
420
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
421
|
+
int i = i0 + i_offset;
|
|
422
|
+
|
|
423
|
+
if (need_check) {
|
|
424
|
+
i = min(i, i_max);
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
|
|
428
|
+
|
|
429
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
|
430
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
|
|
431
|
+
|
|
432
|
+
int qs0 = (ql >> 0) & 0x0F0F0F0F;
|
|
433
|
+
qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
|
|
434
|
+
qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
|
|
435
|
+
qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
|
|
436
|
+
qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
|
|
437
|
+
|
|
438
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
|
|
439
|
+
|
|
440
|
+
int qs1 = (ql >> 4) & 0x0F0F0F0F;
|
|
441
|
+
qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
|
|
442
|
+
qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
|
|
443
|
+
qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
|
|
444
|
+
qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
|
|
445
|
+
|
|
446
|
+
x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
|
|
450
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
451
|
+
|
|
452
|
+
#pragma unroll
|
|
453
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
|
|
454
|
+
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
|
455
|
+
|
|
456
|
+
if (need_check) {
|
|
457
|
+
i = min(i, i_max);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
461
|
+
|
|
462
|
+
x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
|
467
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
468
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
469
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
470
|
+
|
|
471
|
+
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
|
472
|
+
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
|
473
|
+
|
|
474
|
+
int u[2*VDR_Q5_1_Q8_1_MMQ];
|
|
475
|
+
|
|
476
|
+
#pragma unroll
|
|
477
|
+
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
|
478
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
|
479
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
|
483
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
487
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
488
|
+
|
|
489
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
|
490
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
|
491
|
+
|
|
492
|
+
*x_ql = tile_x_qs;
|
|
493
|
+
*x_dm = (half2 *) tile_x_d;
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
|
497
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
498
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
499
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
500
|
+
|
|
501
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
502
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
503
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
504
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
505
|
+
|
|
506
|
+
const int kbx = k / QI8_0;
|
|
507
|
+
const int kqsx = k % QI8_0;
|
|
508
|
+
float * x_dmf = (float *) x_dm;
|
|
509
|
+
|
|
510
|
+
const block_q8_0 * bx0 = (const block_q8_0 *) vx;
|
|
511
|
+
|
|
512
|
+
#pragma unroll
|
|
513
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
514
|
+
int i = i0 + i_offset;
|
|
515
|
+
|
|
516
|
+
if (need_check) {
|
|
517
|
+
i = min(i, i_max);
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
|
521
|
+
|
|
522
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
|
526
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
527
|
+
|
|
528
|
+
#pragma unroll
|
|
529
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
|
|
530
|
+
int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
|
531
|
+
|
|
532
|
+
if (need_check) {
|
|
533
|
+
i = min(i, i_max);
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
537
|
+
|
|
538
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
|
543
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
544
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
545
|
+
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
|
|
546
|
+
|
|
547
|
+
const float * x_dmf = (const float *) x_dm;
|
|
548
|
+
const float * y_df = (const float *) y_ds;
|
|
549
|
+
|
|
550
|
+
return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
|
|
551
|
+
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
|
552
|
+
y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
556
|
+
GGML_UNUSED(x_qh);
|
|
557
|
+
|
|
558
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
|
559
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
|
560
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
|
561
|
+
|
|
562
|
+
*x_ql = tile_x_ql;
|
|
563
|
+
*x_dm = tile_x_dm;
|
|
564
|
+
*x_sc = tile_x_sc;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
|
568
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
569
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
570
|
+
GGML_UNUSED(x_qh);
|
|
571
|
+
|
|
572
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
573
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
574
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
575
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
576
|
+
|
|
577
|
+
const int kbx = k / QI2_K;
|
|
578
|
+
const int kqsx = k % QI2_K;
|
|
579
|
+
|
|
580
|
+
const block_q2_K * bx0 = (const block_q2_K *) vx;
|
|
581
|
+
|
|
582
|
+
#pragma unroll
|
|
583
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
584
|
+
int i = i0 + i_offset;
|
|
585
|
+
|
|
586
|
+
if (need_check) {
|
|
587
|
+
i = min(i, i_max);
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
|
|
591
|
+
|
|
592
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
|
|
596
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
597
|
+
|
|
598
|
+
#pragma unroll
|
|
599
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
|
|
600
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
|
|
601
|
+
|
|
602
|
+
if (need_check) {
|
|
603
|
+
i = min(i, i_max);
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
607
|
+
|
|
608
|
+
x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
#pragma unroll
|
|
612
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
|
613
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
|
614
|
+
|
|
615
|
+
if (need_check) {
|
|
616
|
+
i = min(i, i_max);
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
|
|
620
|
+
|
|
621
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
|
|
622
|
+
}
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
|
626
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
627
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
628
|
+
GGML_UNUSED(x_qh);
|
|
629
|
+
|
|
630
|
+
const int kbx = k / QI2_K;
|
|
631
|
+
const int ky = (k % QI2_K) * QR2_K;
|
|
632
|
+
const float * y_df = (const float *) y_ds;
|
|
633
|
+
|
|
634
|
+
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
|
|
635
|
+
|
|
636
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
|
|
637
|
+
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
|
|
638
|
+
|
|
639
|
+
#pragma unroll
|
|
640
|
+
for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
|
|
641
|
+
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
|
|
645
|
+
|
|
646
|
+
const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
|
|
647
|
+
return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
651
|
+
|
|
652
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
|
653
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
|
|
654
|
+
__shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
|
|
655
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
|
656
|
+
|
|
657
|
+
*x_ql = tile_x_ql;
|
|
658
|
+
*x_dm = tile_x_dm;
|
|
659
|
+
*x_qh = tile_x_qh;
|
|
660
|
+
*x_sc = tile_x_sc;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
|
664
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
665
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
666
|
+
|
|
667
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
668
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
669
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
670
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
671
|
+
|
|
672
|
+
const int kbx = k / QI3_K;
|
|
673
|
+
const int kqsx = k % QI3_K;
|
|
674
|
+
|
|
675
|
+
const block_q3_K * bx0 = (const block_q3_K *) vx;
|
|
676
|
+
|
|
677
|
+
#pragma unroll
|
|
678
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
679
|
+
int i = i0 + i_offset;
|
|
680
|
+
|
|
681
|
+
if (need_check) {
|
|
682
|
+
i = min(i, i_max);
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
|
|
686
|
+
|
|
687
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
|
691
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
|
692
|
+
float * x_dmf = (float *) x_dm;
|
|
693
|
+
|
|
694
|
+
#pragma unroll
|
|
695
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
|
|
696
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
|
|
697
|
+
|
|
698
|
+
if (need_check) {
|
|
699
|
+
i = min(i, i_max);
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
703
|
+
|
|
704
|
+
x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
#pragma unroll
|
|
708
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
|
|
709
|
+
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
|
710
|
+
|
|
711
|
+
if (need_check) {
|
|
712
|
+
i = min(i, i_max);
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
|
716
|
+
|
|
717
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
|
718
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
#pragma unroll
|
|
722
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
|
723
|
+
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
|
724
|
+
|
|
725
|
+
if (need_check) {
|
|
726
|
+
i = min(i, i_max);
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
|
730
|
+
|
|
731
|
+
const int ksc = k % (QI3_K/4);
|
|
732
|
+
|
|
733
|
+
const int ksc_low = ksc % (QI3_K/8);
|
|
734
|
+
const int shift_low = 4 * (ksc / (QI3_K/8));
|
|
735
|
+
const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
|
|
736
|
+
|
|
737
|
+
const int ksc_high = QI3_K/8;
|
|
738
|
+
const int shift_high = 2 * ksc;
|
|
739
|
+
const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
|
|
740
|
+
|
|
741
|
+
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
|
742
|
+
|
|
743
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
|
|
744
|
+
}
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
|
748
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
749
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
750
|
+
|
|
751
|
+
const int kbx = k / QI3_K;
|
|
752
|
+
const int ky = (k % QI3_K) * QR3_K;
|
|
753
|
+
const float * x_dmf = (const float *) x_dm;
|
|
754
|
+
const float * y_df = (const float *) y_ds;
|
|
755
|
+
|
|
756
|
+
const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
|
|
757
|
+
|
|
758
|
+
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
|
|
759
|
+
|
|
760
|
+
#pragma unroll
|
|
761
|
+
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
|
|
762
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
|
|
763
|
+
const int shift = 2 * ((ky % 32) / 8);
|
|
764
|
+
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
|
765
|
+
|
|
766
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
|
|
767
|
+
const int vlh = (vh << 2) & 0x04040404;
|
|
768
|
+
|
|
769
|
+
v[l] = __vsubss4(vll, vlh);
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
|
|
773
|
+
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
777
|
+
GGML_UNUSED(x_qh);
|
|
778
|
+
|
|
779
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
|
780
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
|
|
781
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
|
782
|
+
|
|
783
|
+
*x_ql = tile_x_ql;
|
|
784
|
+
*x_dm = tile_x_dm;
|
|
785
|
+
*x_sc = tile_x_sc;
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
|
789
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
790
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
791
|
+
GGML_UNUSED(x_qh);
|
|
792
|
+
|
|
793
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
794
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
795
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
796
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
797
|
+
|
|
798
|
+
const int kbx = k / QI4_K; // == 0 if QK_K == 256
|
|
799
|
+
const int kqsx = k % QI4_K; // == k if QK_K == 256
|
|
800
|
+
|
|
801
|
+
const block_q4_K * bx0 = (const block_q4_K *) vx;
|
|
802
|
+
|
|
803
|
+
#pragma unroll
|
|
804
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
805
|
+
int i = i0 + i_offset;
|
|
806
|
+
|
|
807
|
+
if (need_check) {
|
|
808
|
+
i = min(i, i_max);
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
|
|
812
|
+
|
|
813
|
+
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
|
817
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
|
818
|
+
|
|
819
|
+
#pragma unroll
|
|
820
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
|
|
821
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
|
|
822
|
+
|
|
823
|
+
if (need_check) {
|
|
824
|
+
i = min(i, i_max);
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
828
|
+
|
|
829
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
#pragma unroll
|
|
833
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
|
834
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
|
835
|
+
|
|
836
|
+
if (need_check) {
|
|
837
|
+
i = min(i, i_max);
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
|
841
|
+
|
|
842
|
+
const int * scales = (const int *) bxi->scales;
|
|
843
|
+
|
|
844
|
+
const int ksc = k % (WARP_SIZE/8);
|
|
845
|
+
|
|
846
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
|
847
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
|
848
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
|
849
|
+
|
|
850
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
|
855
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
|
856
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
|
857
|
+
GGML_UNUSED(x_qh);
|
|
858
|
+
|
|
859
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
|
|
860
|
+
|
|
861
|
+
const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
|
|
862
|
+
return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
|
|
863
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
|
867
|
+
GGML_UNUSED(x_qh);
|
|
868
|
+
|
|
869
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
|
870
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
|
|
871
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
|
872
|
+
|
|
873
|
+
*x_ql = tile_x_ql;
|
|
874
|
+
*x_dm = tile_x_dm;
|
|
875
|
+
*x_sc = tile_x_sc;
|
|
876
|
+
}
|
|
877
|
+
|
|
878
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
|
879
|
+
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
|
880
|
+
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
|
881
|
+
GGML_UNUSED(x_qh);
|
|
882
|
+
|
|
883
|
+
GGML_CUDA_ASSUME(i_offset >= 0);
|
|
884
|
+
GGML_CUDA_ASSUME(i_offset < nwarps);
|
|
885
|
+
GGML_CUDA_ASSUME(k >= 0);
|
|
886
|
+
GGML_CUDA_ASSUME(k < WARP_SIZE);
|
|
887
|
+
|
|
888
|
+
const int kbx = k / QI5_K; // == 0 if QK_K == 256
|
|
889
|
+
const int kqsx = k % QI5_K; // == k if QK_K == 256
|
|
890
|
+
|
|
891
|
+
const block_q5_K * bx0 = (const block_q5_K *) vx;
|
|
892
|
+
|
|
893
|
+
#pragma unroll
|
|
894
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
|
895
|
+
int i = i0 + i_offset;
|
|
896
|
+
|
|
897
|
+
if (need_check) {
|
|
898
|
+
i = min(i, i_max);
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
|
902
|
+
const int ky = QR5_K*kqsx;
|
|
903
|
+
|
|
904
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
|
905
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
|
906
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
|
907
|
+
|
|
908
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
|
|
909
|
+
const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
|
|
910
|
+
const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
|
|
911
|
+
|
|
912
|
+
const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
|
|
913
|
+
const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
|
|
914
|
+
|
|
915
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
|
|
916
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
|
920
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
|
921
|
+
|
|
922
|
+
#pragma unroll
|
|
923
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
|
|
924
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
|
|
925
|
+
|
|
926
|
+
if (need_check) {
|
|
927
|
+
i = min(i, i_max);
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
|
931
|
+
|
|
932
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
|
933
|
+
}
|
|
934
|
+
|
|
935
|
+
#pragma unroll
|
|
936
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
|
937
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
|
938
|
+
|
|
939
|
+
if (need_check) {
|
|
940
|
+
i = min(i, i_max);
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
|
944
|
+
|
|
945
|
+
const int * scales = (const int *) bxi->scales;
|
|
946
|
+
|
|
947
|
+
const int ksc = k % (WARP_SIZE/8);
|
|
948
|
+
|
|
949
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
|
950
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
|
951
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
|
952
|
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    GGML_UNUSED(x_qh);
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
+    const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+}
+
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    GGML_UNUSED(x_qh);
+
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    GGML_UNUSED(x_qh);
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);
+
+    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR6_K*kqsx;
+
+        const int ql = get_int_from_uint8(bxi->ql, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+        const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
+
+        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+    }
+}
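
For readers unfamiliar with the q6_K layout: load_tiles_q6_K above rebuilds each 6-bit weight from a low nibble stored in ql and two high bits stored in qh, then recentres it by subtracting 32 (the 0x20202020 passed to __vsubss4 does exactly that for four packed bytes at once). A scalar sketch of that reconstruction, not part of the diff and using hypothetical names, might look like:

// Not part of the diff: scalar equivalent of the packed q6_K reconstruction above.
// lo4 holds the low 4 bits taken from ql, hi2 the two high bits taken from qh.
static inline int8_t q6_K_reconstruct(uint8_t lo4, uint8_t hi2) {
    const int q = (lo4 & 0x0F) | ((hi2 & 0x03) << 4); // 0..63
    return (int8_t) (q - 32);                         // recentre, like __vsubss4(..., 0x20202020)
}

The per-block scale d and the int8 sub-block scales loaded into x_dmf and x_sc are applied later, inside vec_dot_q6_K_q8_1_impl_mmq.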
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    GGML_UNUSED(x_qh);
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;
+
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+    const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+}
+
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+static __device__ __forceinline__ void mul_mat_q(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = blockIdx.x*mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = blockIdx.y*mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    int * tile_x_ql = nullptr;
+    half2 * tile_x_dm = nullptr;
+    int * tile_x_qh = nullptr;
+    int * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
+
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = __low2float(*dsi_src);
+                }
+            }
+
+            __syncthreads();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
+                }
+            }
+
+            __syncthreads();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;
+
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+        }
+    }
+}
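
mul_mat_q is the generic tile loop shared by all the type-specific kernels that follow: each thread block repeatedly loads one tile of quantized x data and one tile of q8_1 y data into shared memory, synchronizes, accumulates partial dot products into per-thread registers, and finally writes an mmq_y by mmq_x tile of dst. As a point of reference only (not part of the diff), the same load/sync/accumulate skeleton on plain float data looks like the sketch below, with a made-up TILE size and an assumed blockDim of (TILE, TILE):

// Not part of the diff: a minimal fp32 sketch of the tile/sync/accumulate structure
// used by mul_mat_q above, with quantized tile loads and vec_dot replaced by plain
// loads and a scalar multiply. A is M x K, B is K x N, C is M x N, all row-major.
#define TILE 32

__global__ void tiled_matmul_f32(const float * A, const float * B, float * C,
                                 int M, int N, int K) {
    __shared__ float tile_a[TILE][TILE];
    __shared__ float tile_b[TILE][TILE];

    const int row = blockIdx.y*TILE + threadIdx.y; // row of C this thread writes
    const int col = blockIdx.x*TILE + threadIdx.x; // column of C this thread writes

    float acc = 0.0f;

    for (int k0 = 0; k0 < K; k0 += TILE) {
        // cooperative load of one tile of A and one tile of B into shared memory
        tile_a[threadIdx.y][threadIdx.x] = (row < M && k0 + threadIdx.x < K) ? A[row*K + k0 + threadIdx.x] : 0.0f;
        tile_b[threadIdx.y][threadIdx.x] = (col < N && k0 + threadIdx.y < K) ? B[(k0 + threadIdx.y)*N + col] : 0.0f;
        __syncthreads();

        // accumulate the partial dot product for this tile (the role of vec_dot above)
        for (int kk = 0; kk < TILE; ++kk) {
            acc += tile_a[threadIdx.y][kk] * tile_b[kk][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row*N + col] = acc;
    }
}

The real kernel differs in that the x tile stays quantized in shared memory and vec_dot consumes it with DP4A-style integer dot products, which keeps the shared-memory footprint small enough for large tiles.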
+
+static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if defined(RDNA3) || defined(RDNA2)
+    return mmq_config.rdna2;
+#else
+    return mmq_config.rdna1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+#else
+
+#if __CUDA_ARCH__ >= CC_VOLTA
+    return mmq_config.ampere;
+#else
+    return mmq_config.pascal;
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_0<arch_config.y>,
+        load_tiles_q4_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_1<arch_config.y>,
+        load_tiles_q4_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_0<arch_config.y>,
+        load_tiles_q5_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_1<arch_config.y>,
+        load_tiles_q5_1<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
+        load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q2_K<arch_config.y>,
+        load_tiles_q2_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q3_K<arch_config.y>,
+        load_tiles_q3_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q4_K<arch_config.y>,
+        load_tiles_q4_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q5_K<arch_config.y>,
+        load_tiles_q5_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q6_K<arch_config.y>,
+        load_tiles_q6_K<arch_config.y, arch_config.nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    GGML_UNUSED(get_arch_config_device);
+    GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define MMQ_SWITCH_CASE(type_suffix) \
+    case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
+        const bool need_check = false; \
+        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+    } else { \
+        const bool need_check = true; \
+        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
+            (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
+    } break; \
+
+void ggml_cuda_op_mul_mat_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id = ggml_cuda_get_device();
+    const int compute_capability = ggml_cuda_info().devices[id].cc;
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
+
+    mmq_config_t mmq_config;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mmq_config = MMQ_CONFIG_Q4_0;
+            break;
+        case GGML_TYPE_Q4_1:
+            mmq_config = MMQ_CONFIG_Q4_1;
+            break;
+        case GGML_TYPE_Q5_0:
+            mmq_config = MMQ_CONFIG_Q5_0;
+            break;
+        case GGML_TYPE_Q5_1:
+            mmq_config = MMQ_CONFIG_Q5_1;
+            break;
+        case GGML_TYPE_Q8_0:
+            mmq_config = MMQ_CONFIG_Q8_0;
+            break;
+        case GGML_TYPE_Q2_K:
+            mmq_config = MMQ_CONFIG_Q2_K;
+            break;
+        case GGML_TYPE_Q3_K:
+            mmq_config = MMQ_CONFIG_Q3_K;
+            break;
+        case GGML_TYPE_Q4_K:
+            mmq_config = MMQ_CONFIG_Q4_K;
+            break;
+        case GGML_TYPE_Q5_K:
+            mmq_config = MMQ_CONFIG_Q5_K;
+            break;
+        case GGML_TYPE_Q6_K:
+            mmq_config = MMQ_CONFIG_Q6_K;
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    mmq_arch_config_t arch_config;
+    if (compute_capability >= CC_RDNA2) {
+        arch_config = mmq_config.rdna2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        arch_config = mmq_config.rdna1;
+    } else if (compute_capability >= CC_VOLTA) {
+        arch_config = mmq_config.ampere;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        arch_config = mmq_config.pascal;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
+    const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
+
+    switch (src0->type) {
+        MMQ_SWITCH_CASE(4_0)
+        MMQ_SWITCH_CASE(4_1)
+        MMQ_SWITCH_CASE(5_0)
+        MMQ_SWITCH_CASE(5_1)
+        MMQ_SWITCH_CASE(8_0)
+        MMQ_SWITCH_CASE(2_K)
+        MMQ_SWITCH_CASE(3_K)
+        MMQ_SWITCH_CASE(4_K)
+        MMQ_SWITCH_CASE(5_K)
+        MMQ_SWITCH_CASE(6_K)
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+}
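
To make the launch geometry above concrete, take illustrative config values (not the actual MMQ_CONFIG_* constants) of arch_config.x = 64, arch_config.y = 128, arch_config.nwarps = 4, with row_diff = 1000 and src1_ncols = 512:

// Not part of the diff: worked example of the grid arithmetic with made-up values.
const int block_num_x = (1000 + 128 - 1) / 128;  // = 8 tile rows covering row_diff
const int block_num_y = ( 512 +  64 - 1) /  64;  // = 8 tile columns covering src1_ncols
const dim3 block_nums(8, 8, 1);                  // 64 thread blocks
const dim3 block_dims(WARP_SIZE, 4, 1);          // 32 * 4 = 128 threads per block

Since 1000 is not a multiple of 128 in this example, MMQ_SWITCH_CASE would launch the need_check = true kernel variant.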
+
+bool ggml_cuda_supports_mmq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return true;
+        default:
+            return false;
+    }
+}
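
ggml_cuda_supports_mmq simply reports whether a type has one of the mul_mat_q* kernels above. A hedged sketch of how a caller might gate the quantized path on it (the surrounding code is hypothetical, not part of this diff):

// Not part of the diff: hypothetical caller-side gate on the predicate above.
if (ggml_cuda_supports_mmq(src0->type)) {
    // quantize src1 to q8_1 blocks and dispatch through ggml_cuda_op_mul_mat_q
} else {
    // fall back to dequantizing src0 and a generic floating-point matmul (e.g. cuBLAS)
}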