llama_cpp 0.16.2 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
|
@@ -1,674 +0,0 @@
|
|
|
1
|
-
#include "dmmv.cuh"
|
|
2
|
-
#include "dequantize.cuh"
|
|
3
|
-
#include "convert.cuh"
|
|
4
|
-
|
|
5
|
-
#ifndef K_QUANTS_PER_ITERATION
|
|
6
|
-
#define K_QUANTS_PER_ITERATION 2
|
|
7
|
-
#else
|
|
8
|
-
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
|
9
|
-
#endif
|
|
10
|
-
|
|
11
|
-
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
|
12
|
-
|
|
13
|
-
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
|
14
|
-
|
|
15
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
|
16
|
-
if (row > nrows) return;
|
|
17
|
-
|
|
18
|
-
const int num_blocks_per_row = ncols / QK_K;
|
|
19
|
-
const int ib0 = row*num_blocks_per_row;
|
|
20
|
-
|
|
21
|
-
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
|
22
|
-
|
|
23
|
-
float tmp = 0; // partial sum for thread in warp
|
|
24
|
-
|
|
25
|
-
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
|
26
|
-
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
|
27
|
-
|
|
28
|
-
const int step = 16/K_QUANTS_PER_ITERATION;
|
|
29
|
-
|
|
30
|
-
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
|
31
|
-
const int in = tid - step*im; // 0...15 or 0...7
|
|
32
|
-
|
|
33
|
-
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
|
34
|
-
const int q_offset = 32*im + l0;
|
|
35
|
-
const int s_offset = 8*im;
|
|
36
|
-
const int y_offset = 128*im + l0;
|
|
37
|
-
|
|
38
|
-
uint32_t aux[4];
|
|
39
|
-
const uint8_t * d = (const uint8_t *)aux;
|
|
40
|
-
const uint8_t * m = (const uint8_t *)(aux + 2);
|
|
41
|
-
|
|
42
|
-
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
|
43
|
-
|
|
44
|
-
const float * y = yy + i * QK_K + y_offset;
|
|
45
|
-
const uint8_t * q = x[i].qs + q_offset;
|
|
46
|
-
|
|
47
|
-
const float dall = __low2half(x[i].dm);
|
|
48
|
-
const float dmin = __high2half(x[i].dm);
|
|
49
|
-
|
|
50
|
-
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
|
51
|
-
aux[0] = a[0] & 0x0f0f0f0f;
|
|
52
|
-
aux[1] = a[1] & 0x0f0f0f0f;
|
|
53
|
-
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
|
|
54
|
-
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
|
|
55
|
-
|
|
56
|
-
float sum1 = 0, sum2 = 0;
|
|
57
|
-
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
58
|
-
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
|
|
59
|
-
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
|
|
60
|
-
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
|
|
61
|
-
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
|
|
62
|
-
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
|
|
63
|
-
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
|
|
64
|
-
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
|
|
65
|
-
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
|
|
66
|
-
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
|
|
67
|
-
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
|
|
68
|
-
|
|
69
|
-
}
|
|
70
|
-
tmp += dall * sum1 - dmin * sum2;
|
|
71
|
-
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
// sum up partial sums and write back result
|
|
75
|
-
tmp = warp_reduce_sum(tmp);
|
|
76
|
-
|
|
77
|
-
if (threadIdx.x == 0) {
|
|
78
|
-
dst[row] = tmp;
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
|
83
|
-
|
|
84
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
|
85
|
-
if (row > nrows) return;
|
|
86
|
-
|
|
87
|
-
const int num_blocks_per_row = ncols / QK_K;
|
|
88
|
-
const int ib0 = row*num_blocks_per_row;
|
|
89
|
-
|
|
90
|
-
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
|
91
|
-
|
|
92
|
-
float tmp = 0; // partial sum for thread in warp
|
|
93
|
-
|
|
94
|
-
const uint16_t kmask1 = 0x0303;
|
|
95
|
-
const uint16_t kmask2 = 0x0f0f;
|
|
96
|
-
|
|
97
|
-
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
|
98
|
-
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
|
99
|
-
|
|
100
|
-
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
|
101
|
-
const int step = 16/K_QUANTS_PER_ITERATION;
|
|
102
|
-
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
|
103
|
-
const int in = tid - step*im; // 0....15 or 0...7
|
|
104
|
-
|
|
105
|
-
const uint8_t m = 1 << (4*im);
|
|
106
|
-
|
|
107
|
-
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
|
108
|
-
const int q_offset = 32*im + l0;
|
|
109
|
-
const int y_offset = 128*im + l0;
|
|
110
|
-
|
|
111
|
-
uint16_t utmp[4];
|
|
112
|
-
const int8_t * s = (const int8_t *)utmp;
|
|
113
|
-
|
|
114
|
-
const uint16_t s_shift = 4*im;
|
|
115
|
-
|
|
116
|
-
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
|
117
|
-
|
|
118
|
-
const float * y = yy + i * QK_K + y_offset;
|
|
119
|
-
const uint8_t * q = x[i].qs + q_offset;
|
|
120
|
-
const uint8_t * h = x[i].hmask + l0;
|
|
121
|
-
|
|
122
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
123
|
-
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
|
|
124
|
-
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
|
|
125
|
-
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
|
|
126
|
-
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
|
|
127
|
-
|
|
128
|
-
const float d = x[i].d;
|
|
129
|
-
|
|
130
|
-
float sum = 0;
|
|
131
|
-
for (int l = 0; l < n; ++l) {
|
|
132
|
-
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
|
|
133
|
-
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
|
|
134
|
-
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
|
|
135
|
-
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
|
|
136
|
-
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
|
|
137
|
-
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
|
|
138
|
-
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
|
|
139
|
-
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
|
|
140
|
-
}
|
|
141
|
-
tmp += d * sum;
|
|
142
|
-
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
// sum up partial sums and write back result
|
|
146
|
-
tmp = warp_reduce_sum(tmp);
|
|
147
|
-
|
|
148
|
-
if (threadIdx.x == 0) {
|
|
149
|
-
dst[row] = tmp;
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
|
154
|
-
|
|
155
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
|
156
|
-
if (row > nrows) return;
|
|
157
|
-
const int num_blocks_per_row = ncols / QK_K;
|
|
158
|
-
const int ib0 = row*num_blocks_per_row;
|
|
159
|
-
|
|
160
|
-
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
|
161
|
-
|
|
162
|
-
const uint16_t kmask1 = 0x3f3f;
|
|
163
|
-
const uint16_t kmask2 = 0x0f0f;
|
|
164
|
-
const uint16_t kmask3 = 0xc0c0;
|
|
165
|
-
|
|
166
|
-
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
|
167
|
-
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
|
168
|
-
|
|
169
|
-
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
|
170
|
-
|
|
171
|
-
const int il = tid/step; // 0...3
|
|
172
|
-
const int ir = tid - step*il; // 0...7 or 0...3
|
|
173
|
-
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
|
174
|
-
|
|
175
|
-
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
|
176
|
-
const int in = il%2;
|
|
177
|
-
|
|
178
|
-
const int l0 = n*(2*ir + in);
|
|
179
|
-
const int q_offset = 32*im + l0;
|
|
180
|
-
const int y_offset = 64*im + l0;
|
|
181
|
-
|
|
182
|
-
uint16_t aux[4];
|
|
183
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
|
184
|
-
|
|
185
|
-
#if K_QUANTS_PER_ITERATION == 2
|
|
186
|
-
uint32_t q32[4];
|
|
187
|
-
const uint8_t * q4 = (const uint8_t *)q32;
|
|
188
|
-
#else
|
|
189
|
-
uint16_t q16[4];
|
|
190
|
-
const uint8_t * q4 = (const uint8_t *)q16;
|
|
191
|
-
#endif
|
|
192
|
-
|
|
193
|
-
float tmp = 0; // partial sum for thread in warp
|
|
194
|
-
|
|
195
|
-
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
|
196
|
-
|
|
197
|
-
const float * y1 = yy + i*QK_K + y_offset;
|
|
198
|
-
const float * y2 = y1 + 128;
|
|
199
|
-
|
|
200
|
-
const float dall = __low2half(x[i].dm);
|
|
201
|
-
const float dmin = __high2half(x[i].dm);
|
|
202
|
-
|
|
203
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
204
|
-
aux[0] = a[im+0] & kmask1;
|
|
205
|
-
aux[1] = a[im+2] & kmask1;
|
|
206
|
-
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
|
207
|
-
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
|
208
|
-
|
|
209
|
-
#if K_QUANTS_PER_ITERATION == 2
|
|
210
|
-
const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
|
|
211
|
-
const uint32_t * q2 = q1 + 16;
|
|
212
|
-
|
|
213
|
-
q32[0] = q1[0] & 0x0f0f0f0f;
|
|
214
|
-
q32[1] = q1[0] & 0xf0f0f0f0;
|
|
215
|
-
q32[2] = q2[0] & 0x0f0f0f0f;
|
|
216
|
-
q32[3] = q2[0] & 0xf0f0f0f0;
|
|
217
|
-
|
|
218
|
-
float4 s = {0.f, 0.f, 0.f, 0.f};
|
|
219
|
-
float smin = 0;
|
|
220
|
-
for (int l = 0; l < 4; ++l) {
|
|
221
|
-
s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
|
|
222
|
-
s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
|
|
223
|
-
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
|
224
|
-
}
|
|
225
|
-
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
|
|
226
|
-
#else
|
|
227
|
-
const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
|
|
228
|
-
const uint16_t * q2 = q1 + 32;
|
|
229
|
-
|
|
230
|
-
q16[0] = q1[0] & 0x0f0f;
|
|
231
|
-
q16[1] = q1[0] & 0xf0f0;
|
|
232
|
-
q16[2] = q2[0] & 0x0f0f;
|
|
233
|
-
q16[3] = q2[0] & 0xf0f0;
|
|
234
|
-
|
|
235
|
-
float4 s = {0.f, 0.f, 0.f, 0.f};
|
|
236
|
-
float smin = 0;
|
|
237
|
-
for (int l = 0; l < 2; ++l) {
|
|
238
|
-
s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
|
|
239
|
-
s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
|
|
240
|
-
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
|
241
|
-
}
|
|
242
|
-
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
|
|
243
|
-
#endif
|
|
244
|
-
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
// sum up partial sums and write back result
|
|
248
|
-
tmp = warp_reduce_sum(tmp);
|
|
249
|
-
|
|
250
|
-
if (tid == 0) {
|
|
251
|
-
dst[row] = tmp;
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
|
|
256
|
-
|
|
257
|
-
const int row = blockIdx.x;
|
|
258
|
-
const int num_blocks_per_row = ncols / QK_K;
|
|
259
|
-
const int ib0 = row*num_blocks_per_row;
|
|
260
|
-
|
|
261
|
-
const block_q5_K * x = (const block_q5_K *)vx + ib0;
|
|
262
|
-
|
|
263
|
-
float tmp = 0; // partial sum for thread in warp
|
|
264
|
-
|
|
265
|
-
const uint16_t kmask1 = 0x3f3f;
|
|
266
|
-
const uint16_t kmask2 = 0x0f0f;
|
|
267
|
-
const uint16_t kmask3 = 0xc0c0;
|
|
268
|
-
|
|
269
|
-
const int tid = threadIdx.x/2; // 0...15
|
|
270
|
-
const int ix = threadIdx.x%2;
|
|
271
|
-
|
|
272
|
-
const int il = tid/4; // 0...3
|
|
273
|
-
const int ir = tid - 4*il;// 0...3
|
|
274
|
-
const int n = 2;
|
|
275
|
-
|
|
276
|
-
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
|
277
|
-
const int in = il%2;
|
|
278
|
-
|
|
279
|
-
const int l0 = n*(2*ir + in);
|
|
280
|
-
const int q_offset = 32*im + l0;
|
|
281
|
-
const int y_offset = 64*im + l0;
|
|
282
|
-
|
|
283
|
-
const uint8_t hm1 = 1 << (2*im);
|
|
284
|
-
const uint8_t hm2 = hm1 << 4;
|
|
285
|
-
|
|
286
|
-
uint16_t aux[4];
|
|
287
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
|
288
|
-
|
|
289
|
-
uint16_t q16[8];
|
|
290
|
-
const uint8_t * q4 = (const uint8_t *)q16;
|
|
291
|
-
|
|
292
|
-
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
|
293
|
-
|
|
294
|
-
const uint8_t * ql1 = x[i].qs + q_offset;
|
|
295
|
-
const uint8_t * qh = x[i].qh + l0;
|
|
296
|
-
const float * y1 = yy + i*QK_K + y_offset;
|
|
297
|
-
const float * y2 = y1 + 128;
|
|
298
|
-
|
|
299
|
-
const float dall = __low2half(x[i].dm);
|
|
300
|
-
const float dmin = __high2half(x[i].dm);
|
|
301
|
-
|
|
302
|
-
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
303
|
-
aux[0] = a[im+0] & kmask1;
|
|
304
|
-
aux[1] = a[im+2] & kmask1;
|
|
305
|
-
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
|
306
|
-
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
|
307
|
-
|
|
308
|
-
float4 sum = {0.f, 0.f, 0.f, 0.f};
|
|
309
|
-
float smin = 0;
|
|
310
|
-
const uint16_t * q1 = (const uint16_t *)ql1;
|
|
311
|
-
const uint16_t * q2 = q1 + 32;
|
|
312
|
-
q16[0] = q1[0] & 0x0f0f;
|
|
313
|
-
q16[1] = q1[8] & 0x0f0f;
|
|
314
|
-
q16[2] = (q1[0] >> 4) & 0x0f0f;
|
|
315
|
-
q16[3] = (q1[8] >> 4) & 0x0f0f;
|
|
316
|
-
q16[4] = q2[0] & 0x0f0f;
|
|
317
|
-
q16[5] = q2[8] & 0x0f0f;
|
|
318
|
-
q16[6] = (q2[0] >> 4) & 0x0f0f;
|
|
319
|
-
q16[7] = (q2[8] >> 4) & 0x0f0f;
|
|
320
|
-
for (int l = 0; l < n; ++l) {
|
|
321
|
-
sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
|
|
322
|
-
+ y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
|
|
323
|
-
sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
|
|
324
|
-
+ y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
|
|
325
|
-
sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
|
|
326
|
-
+ y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
|
|
327
|
-
sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
|
|
328
|
-
+ y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
|
|
329
|
-
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
|
330
|
-
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
|
331
|
-
}
|
|
332
|
-
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
// sum up partial sums and write back result
|
|
336
|
-
tmp = warp_reduce_sum(tmp);
|
|
337
|
-
|
|
338
|
-
if (threadIdx.x == 0) {
|
|
339
|
-
dst[row] = tmp;
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
|
344
|
-
|
|
345
|
-
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
|
346
|
-
|
|
347
|
-
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
|
348
|
-
if (row > nrows) return;
|
|
349
|
-
|
|
350
|
-
const int num_blocks_per_row = ncols / QK_K;
|
|
351
|
-
const int ib0 = row*num_blocks_per_row;
|
|
352
|
-
|
|
353
|
-
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
|
354
|
-
|
|
355
|
-
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
|
356
|
-
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
|
357
|
-
|
|
358
|
-
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
|
359
|
-
|
|
360
|
-
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
|
361
|
-
const int in = tid - step*im; // 0...15 or 0...7
|
|
362
|
-
|
|
363
|
-
#if K_QUANTS_PER_ITERATION == 1
|
|
364
|
-
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
|
365
|
-
const int is = 0;
|
|
366
|
-
#else
|
|
367
|
-
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
|
368
|
-
const int is = in / 4;
|
|
369
|
-
#endif
|
|
370
|
-
const int ql_offset = 64*im + l0;
|
|
371
|
-
const int qh_offset = 32*im + l0;
|
|
372
|
-
const int s_offset = 8*im + is;
|
|
373
|
-
const int y_offset = 128*im + l0;
|
|
374
|
-
|
|
375
|
-
float tmp = 0; // partial sum for thread in warp
|
|
376
|
-
|
|
377
|
-
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
|
378
|
-
|
|
379
|
-
const float * y = yy + i * QK_K + y_offset;
|
|
380
|
-
const uint8_t * ql = x[i].ql + ql_offset;
|
|
381
|
-
const uint8_t * qh = x[i].qh + qh_offset;
|
|
382
|
-
const int8_t * s = x[i].scales + s_offset;
|
|
383
|
-
|
|
384
|
-
const float d = x[i].d;
|
|
385
|
-
|
|
386
|
-
#if K_QUANTS_PER_ITERATION == 1
|
|
387
|
-
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
|
388
|
-
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
|
389
|
-
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
|
390
|
-
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
|
391
|
-
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
|
392
|
-
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
|
393
|
-
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
|
394
|
-
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
|
395
|
-
tmp += sum;
|
|
396
|
-
#else
|
|
397
|
-
float sum = 0;
|
|
398
|
-
for (int l = 0; l < 4; ++l) {
|
|
399
|
-
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
|
400
|
-
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
|
|
401
|
-
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
|
|
402
|
-
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
|
403
|
-
}
|
|
404
|
-
tmp += sum;
|
|
405
|
-
#endif
|
|
406
|
-
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
// sum up partial sums and write back result
|
|
410
|
-
tmp = warp_reduce_sum(tmp);
|
|
411
|
-
|
|
412
|
-
if (tid == 0) {
|
|
413
|
-
dst[row] = tmp;
|
|
414
|
-
}
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
// Read two consecutive f16 values starting at element ib + iqs of vx into v.
// Matches the dequantize_kernel_t signature so F16 tensors can reuse the
// generic dequantize_mul_mat_vec path.
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * src = (const half *) vx + ib + iqs;

    // if dfloat == float the half values are converted implicitly on assignment
    v.x = src[0];
    v.y = src[1];
}
|
|
424
|
-
|
|
425
|
-
// Map a ggml quantization type to the device function that dequantizes two
// values from a block of that type. Evaluated at compile time so the kernel
// template below can bake the function pointer into each instantiation.
// Returns nullptr for types that have no DMMV dequantizer.
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
    return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
        type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
        type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
        type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
        type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
        type == GGML_TYPE_F16 ? convert_f16 :
        nullptr;
}
|
|
434
|
-
|
|
435
|
-
// Generic dequantize + matrix-vector multiply kernel for the non-K quant
// types and F16: each row of the quantized matrix vx is multiplied with the
// dense vector y and the scalar result written to dst[row].
// Thread layout: threadIdx.y selects the row within the block, threadIdx.x
// (one warp wide, see the launchers below) strides across the columns.
template <ggml_type type>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
    constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
    constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);

    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
    }

    const int tid = threadIdx.x;

    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    // for qr == 2 the dequantizer yields the low- and high-nibble values of
    // the same byte, which are qk/2 apart in y; for qr == 1 they are adjacent
    const int y_offset = qr == 1 ? 1 : qk/2;

// partial sum for each thread
#ifdef GGML_CUDA_F16
    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
#else
    float tmp = 0.0f;
#endif // GGML_CUDA_F16

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index

// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter

            // dequantize
            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
            dfloat2 v;
            dequantize_kernel(vx, ib, iqs + j/qr, v);

            // matrix multiplication
            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16
            tmp += __hmul2(v, {
                y[iybs + iqs + j/qr + 0],
                y[iybs + iqs + j/qr + y_offset]
            });
#else
            tmp += v.x * y[iybs + iqs + j/qr + 0];
            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
#endif // GGML_CUDA_F16
        }
    }

    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);

    if (tid == 0) {
#ifdef GGML_CUDA_F16
        dst[row] = tmp.x + tmp.y; // collapse the half2 pair into one float
#else
        dst[row] = tmp;
#endif // GGML_CUDA_F16
    }
}
|
|
501
|
-
|
|
502
|
-
// Launch the Q4_0 dequantize-mul-mat-vec kernel: one warp per row,
// GGML_CUDA_MMV_Y rows per thread block.
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    // the number of rows may exceed the maximum grid size in the y or z
    // dimensions, so lay the blocks out along the x dimension instead
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q4_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
511
|
-
|
|
512
|
-
// Launch the Q4_1 dequantize-mul-mat-vec kernel; rows are distributed along
// the x grid dimension, GGML_CUDA_MMV_Y rows per block.
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q4_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
520
|
-
|
|
521
|
-
// Launch the Q5_0 dequantize-mul-mat-vec kernel; rows are distributed along
// the x grid dimension, GGML_CUDA_MMV_Y rows per block.
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q5_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
529
|
-
|
|
530
|
-
// Launch the Q5_1 dequantize-mul-mat-vec kernel; rows are distributed along
// the x grid dimension, GGML_CUDA_MMV_Y rows per block.
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q5_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
538
|
-
|
|
539
|
-
// Launch the Q8_0 dequantize-mul-mat-vec kernel; rows are distributed along
// the x grid dimension, GGML_CUDA_MMV_Y rows per block.
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q8_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
547
|
-
|
|
548
|
-
// Launch the Q2_K dequantize-mul-mat-vec kernel with 32 threads per row.
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    // 2 rows per block is very slightly faster than 1,
    // even when K_QUANTS_PER_ITERATION = 2
    const int rows_per_block = 2;
    const int nblocks = (nrows + rows_per_block - 1) / rows_per_block;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(32, rows_per_block, 1);
    dequantize_mul_mat_vec_q2_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
556
|
-
|
|
557
|
-
// Launch the Q3_K dequantize-mul-mat-vec kernel with 32 threads per row;
// the number of rows handled per block shrinks as K_QUANTS_PER_ITERATION grows.
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int rows_per_block = 2 / K_QUANTS_PER_ITERATION;
    const int nblocks = (nrows + rows_per_block - 1) / rows_per_block;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(32, rows_per_block, 1);
    dequantize_mul_mat_vec_q3_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
565
|
-
|
|
566
|
-
// Launch the Q4_K dequantize-mul-mat-vec kernel with 32 threads per row;
// the number of rows handled per block shrinks as K_QUANTS_PER_ITERATION grows.
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int rows_per_block = 2 / K_QUANTS_PER_ITERATION;
    const int nblocks = (nrows + rows_per_block - 1) / rows_per_block;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(32, rows_per_block, 1);
    dequantize_mul_mat_vec_q4_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
574
|
-
|
|
575
|
-
// Launch the Q5_K dequantize-mul-mat-vec kernel: one 32-thread block per row
// (the kernel takes no nrows argument, the row comes from the block index).
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block(32, 1, 1);
    dequantize_mul_mat_vec_q5_k<<<nrows, block, 0, stream>>>(vx, y, dst, ncols);
}
|
|
580
|
-
|
|
581
|
-
// Launch the Q6_K dequantize-mul-mat-vec kernel with 32 threads per row;
// the number of rows handled per block shrinks as K_QUANTS_PER_ITERATION grows.
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int rows_per_block = 2 / K_QUANTS_PER_ITERATION;
    const int nblocks = (nrows + rows_per_block - 1) / rows_per_block;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(32, rows_per_block, 1);
    dequantize_mul_mat_vec_q6_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
589
|
-
|
|
590
|
-
// Launch the F16 variant of the generic DMMV kernel (conversion only, no
// dequantization); rows along the x grid dimension, GGML_CUDA_MMV_Y per block.
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int nblocks = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 grid(nblocks, 1, 1);
    const dim3 block(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_F16><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
|
|
598
|
-
|
|
599
|
-
// Backend entry point for the dequantize-mul-mat-vec op: dispatches the
// row slice [row_low, row_high) of src0 (quantized or f16 matrix) times
// src1 (f32 vector data in src1_ddf_i) to the type-specific launcher,
// writing f32 results to dst_dd_i.
void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {
    GGML_UNUSED(ctx);
    const int64_t ne00 = src0->ne[0];          // src0 row length == number of columns processed
    const int64_t row_diff = row_high - row_low; // number of rows in this slice

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
    half * src1_dfloat = nullptr; // dfloat == half

    // only the non-K quant types and F16 go through the generic kernel that
    // can consume half input; the K-quant kernels below always take f32
    bool src1_convert_f16 =
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
        src1_dfloat = src1_dfloat_a.alloc(ne00);
        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
        GGML_ASSERT(to_fp16_cuda != nullptr);
        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
    }
#else
    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
#endif // GGML_CUDA_F16

    // NOTE: the non-K cases pass src1_dfloat (possibly converted to half),
    // the K-quant cases pass the original f32 src1_ddf_i
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_1:
            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_0:
            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_1:
            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q8_0:
            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q2_K:
            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q3_K:
            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q4_K:
            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_Q6_K:
            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
            break;
        case GGML_TYPE_F16:
            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
    }

    // unused parameters kept for interface uniformity with other backend ops
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}
|